Example #1
def change_all_information(db, tb):
	# Read the table from the database
	tableDF = sqlSession.read.jdbc(
		properties={"driver": "com.mysql.jdbc.Driver"}
		, url="jdbc:mysql://192.168.1.228:3307/" + db + "?user=root&password=123", table=tb, lowerBound=1,
		upperBound=10000000, numPartitions=10)
	# Filter out invalid values
	aiminfo = tableDF.filter(
		(tableDF['The_duration'] > 0) & (tableDF['The_price'] > 0))
	# Weighted scoring
	# Keep two decimal places; F is pyspark.sql.functions
	recommand1 = F.bround(100 - (aiminfo['The_duration'] / 3000 * 0.9 + aiminfo['The_price'] / 100 * 0.1), scale=2)
	recommand2 = F.bround(100-(aiminfo['The_duration']/3000 * 0.7 + aiminfo['The_price']/100 * 0.3), scale=2)
	recommand3 = F.bround(100 - (aiminfo['The_duration']/3000 * 0.5 + aiminfo['The_price']/100 * 0.5), scale=2)
	recommand4 = F.bround(100 - (aiminfo['The_duration']/3000 * 0.3 + aiminfo['The_price']/100 * 0.7), scale=2)
	recommand5 = F.bround(100 - (aiminfo['The_duration'] / 3000 * 0.1 + aiminfo['The_price'] / 100 * 0.9), scale=2)
	# Add the new recommand columns
	aiminfo = aiminfo.withColumn("recommand1", recommand1)
	aiminfo = aiminfo.withColumn("recommand2", recommand2)
	aiminfo = aiminfo.withColumn("recommand3", recommand3)
	aiminfo = aiminfo.withColumn("recommand4", recommand4)
	aiminfo = aiminfo.withColumn("recommand5", recommand5)
	# print(aiminfo)

	# Overwrite the data into a new table; the table is created if it does not exist
	aiminfo.write.jdbc(
	properties={"driver": "com.mysql.jdbc.Driver"},
	url="jdbc:mysql://192.168.1.228:3307/" + db + "?user=root&password=123", table="indirectjourneys_dealed", mode="overwrite")
Example #2
def getTrends(df, attrbs, targColumn, targVal):

    targVals = df.groupBy(targColumn).count()
    recordsNo = df.count()
    targVals = targVals.collect()
    targVals = [tuple(x) for x in targVals]
    targVals_counts = dict(targVals)

    df_rows = getTrendRows(df, attrbs, targColumn, targVal, recordsNo,
                           targVals_counts, 0, [])
    trends_cols = [
        'Trend', 'Count of Target Val', 'Support', 'Confidence', 'Lift',
        'Metric'
    ]
    df_req = sqlContext.createDataFrame(df_rows, trends_cols)
    df_req = df_req.withColumn('Support', bround(col('Support'), 3))
    df_req = df_req.withColumn('Confidence', bround(col('Confidence'), 3))
    df_req = df_req.withColumn('Lift', bround(col('Lift'), 3))
    df_req = df_req.withColumn('Metric', bround(col('Metric'), 3))

    return df_req
Example #3
def search_words(query, N, tokensTfIdf, df):
    # Splitting the query into separate words
    query_splitted = query.lower().split()

    # Counting number of words in the query
    num_words_query = len(query_splitted)

    # Filtering the tokens containing the query's word
    filtered = tokensTfIdf.filter(
        tokensTfIdf.token.rlike('(^|\\s)(' + '|'.join(query_splitted) +
                                ')(\\s|$)'))

    # Counting the number of words each row/document has in common with the query and summing their tf-idf values
    q_sigma_count = filtered.groupby(tokensTfIdf._id).agg(
        f.count('tf_idf').alias('counts'),
        f.sum('tf_idf').alias('sigma'))

    # Calculating the score for each document
    docs_score = q_sigma_count.select(
        '_id', ((q_sigma_count.counts / num_words_query) *
                q_sigma_count.sigma).alias('score'))

    # Retrieve top N maximum scores for the query
    ranked = docs_score.orderBy('score', ascending=False).limit(N)

    # Retrieve text_entry's associated with each high score _id
    search_result = ranked.join(df, df._id == ranked._id).select(
        ranked._id, f.bround(ranked.score, 3),
        'text_entry').orderBy('score', ascending=False).collect()

    # Printing the search result
    def print_search(q, n, output):
        print('Query: ', q, ',', 'N: ', n)
        for i in output:
            print(tuple(i))
        print('\n')
df.selectExpr(
    "CustomerId",
    "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity")\
.show(2)
""" =
SELECT 
    customerId,
    (POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity
FROM dfTable
"""

#3 rounding numbers (round uses HALF_UP; bround uses HALF_EVEN, i.e. banker's rounding, so 2.5 rounds to 2)
print("3")
df.select(
    round(lit("2.5")),
    bround(lit("2.5"))
)\
.show(2)
""" =
SELECT 
    round(2.5),
    bround(2.5)
"""

#4 Statistical Correlation between Quantity and UnitPrice
print("4")
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()
"""
SELECT
    corr(Quantity, UnitPrice)
FROM dfTable
"""
Example #5
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show(5, False)

DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter)) \
    .where("isExpensive").select("unitPrice", "isExpensive").show(5, False)
# Working with numeric types: in Spark you simply write the arithmetic expression and make sure it is valid for the numeric columns involved
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(5, False)
# The same computation expressed as a SQL expression
df.selectExpr(
    "CustomerId",
    "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)
# round uses HALF_UP rounding; bround uses HALF_EVEN (banker's rounding), so 2.5 rounds to 2
df.select(bround(lit(2.5)), round(lit(2.5))).show(2)
# Correlation between two columns
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()
# Summary statistics for a column / group of columns
df.describe().show()
# stat (DataFrameStatFunctions) wraps a number of statistical functions
colName = "UnitPrice"
quantileProbs = [0.5]
relError = 0.05
df.stat.approxQuantile("UnitPrice", quantileProbs, relError)
# Cross-tabulation of two columns
df.stat.crosstab("StockCode", "Quantity").show()
# Frequent items
df.stat.freqItems(["StockCode", "Quantity"]).show(2, False)
# Generate a unique ID for each row
Example #6
# We can multiply our columns together because they were both numerical
from pyspark.sql.functions import expr, pow, round, bround
fabricatedQuantity = pow(col('Quantity') * col('UnitPrice'), 2) + 5
df.select(expr('CustomerId'), fabricatedQuantity.alias('realQuantity')).show(2)


df.selectExpr(
        'CustomerId',
        '(Power((Quantity * UnitPrice), 2.0) + 5) as realQuantity').show(2)
# select customerId, (power((Quantity * Unitprice), 2.0) + 5) as realQuantity from dfTable

df.select(round(col('UnitPrice'), 1).alias('rounded'), col('UnitPrice')).show(5)
df.selectExpr('round(UnitPrice, 1) as rounded', 'UnitPrice').show(5)

# You can use either DF API methods or just SQL Expressions
df.select(round(lit('2.5')), bround(lit('2.5'))).show(2)
df.selectExpr('round(2.5)', 'bround(2.5)').show(2)


# Unique ID
# We can add a unique ID to each row by using the function monotonically_increasing_id
# The function generates a unique, monotonically increasing value for each row, starting with 0 (values are not guaranteed to be consecutive)

from pyspark.sql.functions import monotonically_increasing_id

df.select(monotonically_increasing_id()).show(2)


# Working with String

from pyspark.sql.functions import initcap, lower, upper
    def data_explore(self, df_train, df_test):

        sqlContext = SQLContext(self.sc)

        print('--------2. Statistical features: count, ratio, nunique, ctr-related features')
        print("Compute counts for cross features and category-preference ratios")
        count_feats_list = []
        print('cross count')
        users = ['uid']
        authors = ['author_id', 'item_city', 'channel', 'music_id', 'device']
        count_feats_list.extend([[u_col, a_col] for u_col in users
                                 for a_col in authors])

        users = ['author_id']
        authors = ['channel', 'user_city', 'item_city', 'music_id']
        count_feats_list.extend([[u_col, a_col] for u_col in users
                                 for a_col in authors])

        count_feats_list.append(['uid', 'channel', 'device'])
        count_feats_list.append(['author_id', 'item_city', 'music_id'])
        print("计算count的字段有以下这些")
        print(count_feats_list)

        for i in range(len(count_feats_list)):
            group_cols = count_feats_list[i]
            new_feature = '_'.join(group_cols)
            print("根据上述保存的df_train 和df_test 再处理2维交叉变量")
            if len(group_cols) == 2:
                print("开始处理2维交叉变量")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType())))
                df2 = df_train.groupby(new_feature).count()\
                       .withColumnRenamed('count',new_feature+'_count')
                # category-preference ratio (min-max normalised count)
                count_min = df2.select(fn.min(df2[new_feature +
                                                  '_count'])).collect()[0][0]
                count_max = df2.select(fn.max(df2[new_feature +
                                                  '_count'])).collect()[0][0]
                # F.bround("Rank", scale=4)
                df2 = df2.withColumn(
                    new_feature + '_count_ratio',
                    fn.bround(
                        ((df2[new_feature + '_count'] - fn.lit(count_min)) /
                         ((fn.lit(count_max) - fn.lit(count_min)).cast(
                             typ.IntegerType()))),
                        scale=3))

                if new_feature == "uid_author_id":  #用户看了这个用户发布的视频 超过2个
                    percent_list = [0, 90, 95, 98, 100]
                if new_feature == "uid_music_id":
                    percent_list = [0, 75, 90, 95, 98, 100]
                if new_feature == "uid_device":
                    percent_list = [0, 25, 50, 75, 90, 100]
                if new_feature == "author_id_user_city":
                    percent_list = [0, 75, 90, 95, 98, 100]
                if new_feature == "author_id_music_id":
                    percent_list = [0, 75, 90, 95, 98, 100]
                else:
                    percent_list = [0, 50, 75, 90, 95, 100]

                df2 = self.bining(sqlContext, df2, new_feature + '_count',
                                  percent_list)
                print("查看df2_2")
                df2.show(1, truncate=False)
                df_train = df_train.join(df2, new_feature,
                                         'left').drop(new_feature)
                print("train")
                df_train.show(1, truncate=False)  # the ratio is a continuous variable in the range 0-1
                df_train.printSchema()
                df_test = df_test.join(df2, new_feature,
                                       'left').drop(new_feature)  # join first, then drop the key
                print("test")
                df_test.show(1, truncate=False)

            if len(group_cols) == 3:
                print("开始处理3维交叉变量")
                df_train = df_train.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_train[group_cols[0]].cast(typ.StringType()),
                        df_train[group_cols[1]].cast(typ.StringType()),
                        df_train[group_cols[2]].cast(typ.StringType())))
                df_test = df_test.withColumn(
                    new_feature,
                    fn.concat_ws(
                        '_', df_test[group_cols[0]].cast(typ.StringType()),
                        df_test[group_cols[1]].cast(typ.StringType()),
                        df_test[group_cols[2]].cast(typ.StringType())))

                df3 = df_train.groupby(new_feature).count()\
                       .withColumnRenamed('count',new_feature+'_count')

                # category-preference ratio (min-max normalised count)
                count_min = df3.select(fn.min(df3[new_feature +
                                                  '_count'])).collect()[0][0]
                count_max = df3.select(fn.max(df3[new_feature +
                                                  '_count'])).collect()[0][0]
                # F.bround("Rank", scale=4)
                df3 = df3.withColumn(
                    new_feature + '_count_ratio',
                    fn.bround(
                        ((df3[new_feature + '_count'] - fn.lit(count_min)) /
                         ((fn.lit(count_max) - fn.lit(count_min)).cast(
                             typ.IntegerType()))),
                        scale=3))
                # print("查看df3_1")
                # df3.show(5,truncate=False)
                percent_list = [0, 50, 75, 90, 95, 100]
                df3 = self.bining(sqlContext, df3, new_feature + '_count',
                                  percent_list)
                print("查看df3_2")
                df3.show(1, truncate=False)
                df_train = df_train.join(df3, new_feature,
                                         'left').drop(new_feature)
                print("train")
                df_train.show(1, truncate=False)
                df_train.printSchema()
                df_test = df_test.join(df3, new_feature,
                                       'left').drop(new_feature)
                print("test")
                df_test.show(1, truncate=False)

        print("交叉特征处理结束")
        print("查看train的表结构")
        df_train.printSchema()
        # print("删除没有必要的列")
        # unuse_col=['item_city','user_city','device','author_id','music_id',]  #'uid','item_id'这两列不能删除,后面提交结果的时候应该要用到
        # df_train=self.dropUnuseCols(df_train,unuse_col)
        # df_test=self.dropUnuseCols(df_test,unuse_col)

        print("表中含有为null的字段,主要产生在leftjoin的时候")
        # df_train=df_train.na.fill({'uid_author_id_count_bin':1,'uid_author_id_count_ratio':0,\
        #                            'uid_item_city_count_bin':1,'uid_item_city_count_ratio':0,\
        #                            'uid_channel_count_bin':1,'uid_channel_count_ratio':0,\
        #                            'uid_music_id_count_bin':1,'uid_music_id_count_ratio':0,\
        #                            'uid_device_count_bin':1,'uid_device_count_ratio':0,\
        #                            'author_id_channel_count_bin':1,'author_id_channel_count_ratio':0,\
        #                            'author_id_user_city_count_bin':1,'author_id_user_city_count_ratio':0,\
        #                            'author_id_item_city_count_bin':1,'author_id_item_city_count_ratio':0,\
        #                            'author_id_music_id_count_bin':1,'author_id_music_id_count_ratio':0,\
        #                            'uid_channel_device_count_bin':1,'uid_channel_device_count_ratio':0,\
        #                            'author_id_item_city_music_id_bin':1,'author_id_item_city_music_id_ratio':0
        #                            })
        df_train = df_train.na.fill({
            'user_city_count_bin': 1,
            'user_city_count_ratio': 0
        })
        # user_city_count_bin and device_count_bin are the two fields that were missed in step1_single
        df_test=df_test.na.fill({'user_city_count_bin':1,'user_city_count_ratio':0,\
                                 'device_count_bin':-1,'device_count_ratio':0,\
                                   'uid_author_id_count_bin':1,'uid_author_id_count_ratio':0,\
                                   'uid_item_city_count_bin':1,'uid_item_city_count_ratio':0,\
                                   'uid_channel_count_bin':1,'uid_channel_count_ratio':0,\
                                   'uid_music_id_count_bin':1,'uid_music_id_count_ratio':0,\
                                   'uid_device_count_bin':1,'uid_device_count_ratio':0,\
                                   'author_id_channel_count_bin':1,'author_id_channel_count_ratio':0,\
                                   'author_id_user_city_count_bin':1,'author_id_user_city_count_ratio':0,\
                                   'author_id_item_city_count_bin':1,'author_id_item_city_count_ratio':0,\
                                   'author_id_music_id_count_bin':1,'author_id_music_id_count_ratio':0,\
                                   'uid_channel_device_count_bin':1,'uid_channel_device_count_ratio':0,\
                                   'author_id_item_city_music_id_count_bin':1,'author_id_item_city_music_id_count_ratio':0
                                   })

        print("查看test缺失值")
        df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                      for c in df_test.columns]).show()
        print("查看train缺失值")  #以防万一,可能会漏掉哪个字段
        df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c +
                                                                 '_missing')
                       for c in df_train.columns]).show()

        print('-------5. Save the preprocessing results-------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_single_cross'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_single_cross'
        os.system("hadoop fs -rm -r {}".format(
            train_file_path))  # os.system(command): command is the shell command to execute
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
Example #8
#handling null data in boolean expressions

df.where(col("Description").eqNullSafe("hello")).show()

#working with numbers

fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show()

##rounding

df.select(round(col("UnitPrice"), 1).alias("rounded")).show(5)
df.select(
    round(lit("2.5")).alias("rounded"),
    bround(lit("2.5")).alias("brounded")).show(2)

##basic statistics

df.describe().show()

##using the stat package

###find correlation

df.stat.corr("UnitPrice", "Quantity")
df.select(corr("Quantity", "UnitPrice")).show()

from pyspark.sql.functions import count, mean, max, min, stddev_pop

quantileProbs = [0.5]
    def data_describe(self):
        print('start to read data for rdd:')
        rawRdd_face = self.read_rdd('track2_face_attrs.txt').map(lambda line : json.loads(line))
        # rawRdd_face.cache()
        global keys
        keys=['item_id','gender','beauty','relative_position']
        rawRdd_face2=rawRdd_face.map(lambda dic:{key :jsonpath.jsonpath(dic,'$..'+key)[0] if jsonpath.jsonpath(dic,'$..'+key) else None  for key in keys})
        print(rawRdd_face2.take(10))
        # Convert to a DataFrame; without an explicit schema it would be inferred automatically
        sqlContext = SQLContext(self.sc)
        labels=[
            ('item_id',typ.IntegerType()),
            ('gender',typ.IntegerType()),
            ('beauty',typ.FloatType()),
            ('relative_position',typ.ArrayType(typ.FloatType()))]
        Schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
        df = sqlContext.createDataFrame(rawRdd_face2,Schema)

        attrs = self.sc.parallelize(["relative_position_" + str(i) for i in range(4)]).zipWithIndex().collect()
        print("列名:", attrs)
        for name, index in attrs:
            df = df.withColumn(name, fn.bround(df['relative_position'].getItem(index), scale=3))
        # Drop relative_position
        df_face =df.drop('relative_position')
        del df
        gc.collect()


        # print('-------Save df_face data-------')
        # file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'face_feature'
        # os.system("hadoop fs -rm -r {}".format(file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        # df_face.rdd.map(tuple).saveAsPickleFile(file_path)
        # print('Data saved')

        print('start to read act data  only for uid and item_id :')
        rawRdd_train = self.read_rdd('final_track2_train.txt').map(lambda line : line.split('\t'))
        rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(lambda line : line.split('\t'))
        actionLogRdd_train = rawRdd_train.map(
            lambda x :(int(x[0]), int(x[2])))
        # total = actionLogRdd_train.count()
        # print('total: ' + str(total))
        actionLogRdd_test = rawRdd_test.map(
            lambda x :(int(x[0]), int(x[2])))

        sqlContext = SQLContext(self.sc)
        labels=[('uid',typ.IntegerType()),
            ('item_id',typ.IntegerType())
            ]

        actionLogSchema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])

        dfactionLog_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)
        dfactionLog_test = sqlContext.createDataFrame(actionLogRdd_test, actionLogSchema)

        # Join on item_id
        df_face=df_face.select(["item_id","gender","beauty"])
        df_uid_face_test=dfactionLog_test.select(["uid","item_id"]).join(df_face,'item_id','left').drop("item_id")
        df_uid_face_train=dfactionLog_train.select(["uid","item_id"]).join(df_face,'item_id','left').drop("item_id")
        del dfactionLog_test
        del dfactionLog_train
        gc.collect()

        # Only the uids in the training set are processed
        gdf=df_uid_face_train.groupby("uid")
        df1=gdf.agg(fn.max("beauty").alias("uid_max_beauty"),fn.bround(fn.avg("beauty"),scale=3).alias("uid_avg_beauty"),fn.bround((fn.sum("gender")/fn.count("gender")),scale=3).alias("uid_male_ratio"))
        df1.show(1,truncate=False)
        # In the end only df1 needs to be kept
        print('-------Save uid_face data-------')
        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'uid_face_train'
        os.system("hadoop fs -rm -r {}".format(file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df1.rdd.map(tuple).saveAsPickleFile(file_path)
        print('Data saved')
Example #10
    # 2. calculate the number of words in the query
    q_n = len(query_lst)
    # 3. search for query words in TFIDF and aggregate by document
    # to summarize tf_idf and calculate the frequency of query words in each document
    search = TFIDF.filter(TFIDF.token.isin(query_lst)).groupby(TFIDF._id)\
        .agg(F.sum("tf_idf").alias("sum_tf_idf"), F.count("tf_idf").alias('freq'))

    # 4. the score is calculated by multiplying sum_tf_idf by the frequency of query words in the document
    # and dividing it by the number of words in the query
    # in the last step, order the results by highest scores and take the top N of them as output
    search = search.select((search._id).alias("id"), (search.sum_tf_idf * search.freq / q_n).alias('scores'))\
        .orderBy("scores", ascending=False).limit(N)

    # 5. In the end, join the search output with the original data frame to fetch text_entry
    # and select _id, the rounded score (3 decimals) and text_entry as the result of the search
    search = search.join(df, df._id == search.id).select(search.id, F.bround(search.scores, 3), "text_entry").orderBy("scores", ascending=False)
    return search.collect()


def print_result(query, result):
    # print the search output as tuples
    print('Search Term :', query)
    for i in result:
        print(tuple(i))
    print('-' * 50)

def main(sc):

    sqlcontext = SQLContext(sc)
Example #11
                                                           'SecurityDelay_present',
                                                           'LateAircraftDelay_present'])

DelayCountsDF = DelayTypesDF.groupBy('Region').sum()

newDF = Total_per_regionDF.join(DelayCountsDF,"Region")
newDF.cache()
finalDF = newDF.select(newDF.columns)
finalDF.cache()

## Running a loop to generate columns with ratios of each different type of delay
for c in ['sum(ArrDelay_present)','sum(DepDelay_present)','sum(CarrierDelay_present)','sum(WeatherDelay_present)','sum(NASDelay_present)','sum(SecurityDelay_present)','sum(LateAircraftDelay_present)']:
    finalDF = finalDF.withColumn('proportion_of_'+c,(col(c).cast(DoubleType())/col('Total_flights').cast(DoubleType())))

finalDF.select('Region',
              bround(col('proportion_of_sum(ArrDelay_present)'),3).alias('ratio_ArrD'),
              bround(col('proportion_of_sum(DepDelay_present)'),3).alias('ratio_DepD'),
              bround(col('proportion_of_sum(CarrierDelay_present)'),3).alias('ratio_CarrierD'),
              bround(col('proportion_of_sum(WeatherDelay_present)'),3).alias('ratio_WeaD'),
              bround(col('proportion_of_sum(NASDelay_present)'),3).alias('ratio_NASD'),
              bround(col('proportion_of_sum(SecurityDelay_present)'),3).alias('ratio_SecD'),
              bround(col('proportion_of_sum(LateAircraftDelay_present)'),3).alias('ratio_LateACD')).show()


# ## 2. Which airports are more likely to cause security delays when a flight departs from them? What about the destination airport?
# <font color=blue> Security delays cause very negative experiences for travelers, since they are related to evacuation of terminals, re-boarding of aircraft, and the like. The analysis aims to show the likelihood of security delays, computed as the number of incidents divided by the total number of flights. The airport with the highest number of security-delay incidents, both as an arrival and as a departure airport, is Las Vegas. It is interesting to notice that the top-10 list is the same for both origin and destination, with the slight difference that Phoenix airport is more likely to cause a security delay as a destination airport, while Chicago Midway is more likely as an origin airport. </font>

# In[8]:


# Which airports are more likely to cause security delays when a flight departs from them? What about the destination airport?
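
# A minimal sketch of one way to answer this, under the assumption that the
# flight-level DataFrame (called flightsDF here, a hypothetical name) has
# Origin, Dest and SecurityDelay_present columns, mirroring the *_present
# indicator columns used above: likelihood = security-delay incidents / total flights.
from pyspark.sql.functions import col, bround, count as sql_count, sum as sql_sum
from pyspark.sql.types import DoubleType

origin_likelihood = flightsDF.groupBy('Origin') \
    .agg(sql_count('*').alias('total_flights'),
         sql_sum(col('SecurityDelay_present').cast(DoubleType())).alias('security_delays')) \
    .withColumn('security_delay_likelihood',
                bround(col('security_delays') / col('total_flights'), 4)) \
    .orderBy(col('security_delay_likelihood').desc())
origin_likelihood.show(10)
# Grouping by 'Dest' instead of 'Origin' gives the destination-airport view.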
    def data_explore(self, df_train, df_test):

        sqlContext = SQLContext(self.sc)

        print(
            "------------1、通过时间戳获取年月日时分,(没有工作日特征,月日交叉表示节日特征,年份转化有问题)-----------------"
        )

        # Item publish time minus the earliest publish time, converted to days
        time_min = df_train.select(fn.min(df_train['time'])).collect()
        df_train = df_train.withColumn(
            'time_day', ((df_train.time - fn.lit(time_min[0][0])) /
                         fn.lit(3600 * 24)).cast(typ.IntegerType()))
        # df_train=df_train.withColumn('time_strDate',fn.from_unixtime(df_train.time , "yyyy-MM-dd HH:mm:ss"))
        # Convert the unix timestamp to a date of the given format and extract the hour etc.
        df_train = df_train.withColumn(
            'item_pub_month',
            fn.from_unixtime(df_train.time, "M").cast(typ.IntegerType()))
        df_train = df_train.withColumn(
            'item_pub_day',
            fn.from_unixtime(df_train.time, "d").cast(typ.IntegerType()))
        df_train = df_train.withColumn(
            'item_pub_hour',
            fn.from_unixtime(df_train.time, "k").cast(typ.IntegerType()))
        df_train = df_train.withColumn(
            'item_pub_minute',
            fn.from_unixtime(df_train.time, "m").cast(typ.IntegerType()))
        print("查看month,day,hour,minute的提取是否正确")
        df_train.show(truncate=False)
        df_train = df_train.drop('time')
        # Binning these time-derived fields by count is not very informative, so just treat them as categorical variables; additionally add a pos_neg_ratio feature

        df_test = df_test.withColumn(
            'time_day', ((df_test.time - fn.lit(time_min[0][0])) /
                         fn.lit(3600 * 24)).cast(typ.IntegerType()))
        df_test = df_test.withColumn(
            'item_pub_month',
            fn.from_unixtime(df_test.time, "M").cast(typ.IntegerType()))
        df_test = df_test.withColumn(
            'item_pub_day',
            fn.from_unixtime(df_test.time, "d").cast(typ.IntegerType()))
        df_test = df_test.withColumn(
            'item_pub_hour',
            fn.from_unixtime(df_test.time, "k").cast(typ.IntegerType()))
        df_test = df_test.withColumn(
            'item_pub_minute',
            fn.from_unixtime(df_test.time, "m").cast(typ.IntegerType()))
        df_test = df_test.drop('time')

        print('--------2. Statistical features: count, ratio, nunique, ctr-related features')
        print("Compute counts for basic features and category-preference ratios")
        count_feats_list = []

        print('single feature count')
        count_feats_list.extend([[c] for c in df_train.columns if c not in [
            'time', 'channel', 'like', 'finish', 'duration_time', "time_day",
            "item_pub_month", "item_pub_day", "item_pub_hour",
            "item_pub_minute"
        ]])
        print(count_feats_list)

        print("计算count的字段有以下这些")
        print(count_feats_list)

        for i in range(len(count_feats_list)):
            group_cols = count_feats_list[i]
            new_feature = '_'.join(group_cols)
            # Determine the arity of the cross feature, concatenate the columns, count each feature value and map the counts back
            if len(group_cols) == 1:
                if new_feature in ["music_id", 'user_city', 'item_city']:
                    df1 = df_train.where(df_train[new_feature]!=-1).groupby(new_feature).count()\
                            .withColumnRenamed('count',new_feature+'_count')
                else:
                    df1 = df_train.groupby(new_feature).count()\
                            .withColumnRenamed('count',new_feature+'_count')
                if new_feature != "uid":
                    # category-preference ratio (min-max normalised count)
                    count_min = df1.select(fn.min(
                        df1[new_feature + '_count'])).collect()[0][0]
                    count_max = df1.select(fn.max(
                        df1[new_feature + '_count'])).collect()[0][0]
                    # F.bround("Rank", scale=4)
                    df1 = df1.withColumn(
                        new_feature + '_count_ratio',
                        fn.bround((
                            (df1[new_feature + '_count'] - fn.lit(count_min)) /
                            ((fn.lit(count_max) - fn.lit(count_min)).cast(
                                typ.IntegerType()))),
                                  scale=3))
                if new_feature == "device":  #[1.0, 16.0, 46.0, 102.0, 204.0, 410.0, 10389.0] 修改
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                elif new_feature == "author_id":  #[1.0, 2.0, 7.0, 32.0, 78.0, 276186.0]
                    percent_list = [0, 50, 75, 90, 95, 100]
                elif new_feature == "music_id":  #[1.0, 3.0, 13.0, 73.0, 211.0, 193640.0]
                    percent_list = [0, 50, 75, 90, 95, 100]  # each percent_list is different
                elif new_feature == "uid":  #分箱[1.0, 104.0, 329.0, 741.0, 1131.0, 10389.0]
                    percent_list = [0, 50, 75, 90, 95, 100]
                elif new_feature == "item_id":  #[1.0, 1.0, 2.0, 7.0, 14.0, 6911.0]  分箱修改
                    percent_list = [0, 75, 90, 95, 100]
                elif new_feature == "user_city":  #[1.0, 21935.5, 54519.5, 110179.0, 146319.75, 3789087.0] 修改
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                elif new_feature == "item_city":  #[1.0, 14725.0, 48576.0, 122887.0, 206845.5, 744265.0]  修改
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
                else:
                    percent_list = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

                df1 = self.bining(sqlContext, df1, new_feature + '_count',
                                  percent_list)
                # print(df1.show(5,truncate=False))
                df_train = df_train.join(df1, new_feature, 'left')
                print("train")
                df_train.show(5, truncate=False)  # the ratio is a continuous variable in the range 0-1
                df_test = df_test.join(df1, new_feature, 'left')

                print("test")
                df_test.show(5, truncate=False)
                del df1
                gc.collect()
        print("输出所有一维特征处理后的结果")

        print("表中含有为null的字段,主要产生在leftjoin的时候")
        df_train=df_train.na.fill({'uid_count_bin':-1,\
                                   'user_city_count_bin':1,'user_city_count_ratio':0,\
                                   'item_id_count_bin':-1,'item_id_count_ratio':0,\
                                   'author_id_count_bin':-1,'author_id_count_ratio':0,\
                                   'item_city_count_bin':-1,'item_city_count_ratio':0,\
                                   'music_id_count_bin':-1,'music_id_count_ratio':0,\
                                   'device_count_bin':-1,'device_count_ratio':0
                                   })
        df_test=df_test.na.fill({'uid_count_bin':-1,\
                                 'user_city_count_bin':1,'user_city_count_ratio':0,\
                                 'item_id_count_bin':-1,'item_id_count_ratio':0,\
                                 'author_id_count_bin':-1,'author_id_count_ratio':0,\
                                 'item_city_count_bin':-1,'item_city_count_ratio':0,\
                                 'music_id_count_bin':-1,'music_id_count_ratio':0,\
                                 'device_count_bin':-1,'device_count_ratio':0
                                 })

        print("查看表结构")
        df_train.printSchema()
        df_train.printSchema()
        print("查看train缺失值")
        df_train.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c +
                                                                 '_missing')
                       for c in df_train.columns]).show()

        print("查看test缺失值")
        df_test.agg(*[(1 - (fn.count(c) / fn.count('*'))).alias(c + '_missing')
                      for c in df_test.columns]).show()

        print('-------5. Save after the 1-way feature processing-------')
        test_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_test_single'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get(
            "hdfs_path", "hdfs_data_path") + 'actLog_train_single'
        os.system("hadoop fs -rm -r {}".format(
            train_file_path))  # os.system(command): command is the shell command to execute
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
Example #13
sum(donations.givings) AS givings,avg(donations.avggivings) AS avggivings \
FROM donations, accounts WHERE donations.accountid = accounts.accountid \
GROUP BY accounts.assignedto, donations.year")

stat = spark.sql("SELECT cities.state, cities.country, \
sum(donations.givings) AS givings, avg(donations.avggivings) AS avggivings \
FROM donations, cities WHERE donations.city = cities.city \
GROUP BY cities.state, cities.country")

lvlyear.createOrReplaceTempView("lvlyears")
nameyear.createOrReplaceTempView("nameyears")
stat.createOrReplaceTempView("stats")

#Q1 on OLAP
nameyear.select("assignedto", "year",
                bround("givings").alias("givings")).groupBy(
                    "assignedto", "year").sum().orderBy("assignedto",
                                                        "year").show()
#Q2 on OLAP
lvlyear.select("donorclass", "year",
               bround("givings").alias("givings")).orderBy(
                   "donorclass", "year").show()
#Q3 on OLAP
nameyear.select("assignedto","year",bround("givings").alias("yearly"),\
bround(sum("givings").over(Window.partitionBy("assignedto"))).alias("totperacc"), \
bround(nameyear["givings"]/sum("givings").over(Window.partitionBy("assignedto")),2).alias("perc")).orderBy("assignedto","year").show()
#Q4 on OLAP
donation.groupBy("year").agg(sum("givings").alias("tot")).orderBy(
    "tot", ascending=False).show()
#Q5 on OLAP
stat.groupBy("state", "country").agg(sum("givings").alias("tot")).orderBy(
print(
    retail_df.select(expr("CustomerId"), 'unitPrice', 'quantity',
                     actual_quantity.alias("actual_quantity")).show(3))

print(
    retail_df.selectExpr(
        "CustomerId", "unitPrice", "quantity",
        "(POWER((Quantity * UnitPrice), 2.0) + 5) as actual_quantity").show(3))
#%%
"rounding things"
from pyspark.sql.functions import round, bround

print(
    retail_df.select('unitPrice', round('unitPrice', 1),
                     bround('unitprice', 1)).show(5))
#%%
"Pearson correlation coefficient for two columns"

from pyspark.sql.functions import corr
print(retail_df.stat.corr("Quantity", "UnitPrice"))
print(retail_df.select(corr("Quantity", "UnitPrice")).show())
#%%
"Summary stats"
print(retail_df.describe().show())

"get summary stats manualy"
from pyspark.sql.functions import count, mean, stddev_pop, min, max

print(
    retail_df.select(
location_wise_profit = spark.sql(location_wise_profit_sql)
location_wise_profit.coalesce(1).write.format('com.databricks.spark.csv').save(
    '../output/location_wise_profit.csv', header='true')

# Top Selling products in New York
top_selling_prod_category = \
df.filter(df.STORE_LOCATION == 'New York') \
    .select(['PRODUCT_CATEGORY','MRP','CP','SP']) \
    .groupBy(['PRODUCT_CATEGORY']) \
    .agg(F.min('MRP').alias("MIN_MRP"), \
        F.max('MRP').alias('MAX_MRP') ,\
        F.min('CP').alias('MIN_CP'), \
        F.max('CP').alias('MAX_CP'), \
        F.sum('MRP').alias('SUM_MRP'), \
        F.bround(F.sum("CP"),2).alias('SUM_CP'),\
        F.bround(F.sum('SP'),2).alias('SUM_SP'), \
        F.count('MRP').alias('COUNT_MRP')) \
    .orderBy(F.col('COUNT_MRP').desc()).limit(5)
# Write data back to output file
top_selling_prod_category.coalesce(1).write.format(
    'com.databricks.spark.csv').save('../output/top_selling_prod_category.csv',
                                     header='true')

# Working with date
date_data_output = \
df.select( \
    F.dayofyear(df['Date']).alias("DAY_OF_YEAR"),
    F.month(df['Date']).alias("MONTH"),
    F.dayofweek(df['Date']).alias("DAY_OF_WEEK"),
    F.dayofmonth(df['Date']).alias("DAY_OF_MONTH"),
Example #16
    def city_col_deal(self, df, col):
        df_city_score = df.groupBy(col).avg('finish', 'like') \
            .withColumnRenamed("avg(finish)", "avg_finish").withColumnRenamed("avg(like)", "avg_like")
        df_city_score = df_city_score.withColumn(col + '_score', df_city_score.avg_finish * 0.7 + df_city_score.avg_like * 0.3)\
            .select(col, fn.bround(col + '_score', scale=4).alias(col + '_score'))
        return df_city_score
    def test_round_in_pyspark(self):
        df = self.df.select(round(lit("2.5")), bround(lit("2.5"))).collect()

        for x in df:
            self.assertEqual(x['round(2.5, 0)'], 3.0)
            self.assertEqual(x['bround(2.5, 0)'], 2.0)
Example #18
    def data_describe(self):
        sqlContext = SQLContext(self.sc)
        rootPath = self.parser.get("hdfs_path", "hdfs_data_path")
        print('start to read actLog_test_single_cross')
        test_file_path = rootPath + 'actLog_test_single_cross'
        actLog_test_rdd = self.sc.pickleFile(test_file_path)
        # labels need to be updated
        labels = [
            ('duration_time', typ.IntegerType()),
            ('device', typ.IntegerType()),
            ('music_id', typ.IntegerType()),
            ('item_city', typ.IntegerType()),
            ('author_id', typ.IntegerType()),
            ('item_id', typ.IntegerType()),
            ('user_city', typ.IntegerType()),
            ('uid', typ.IntegerType()),
            ('channel', typ.IntegerType()),
            ('finish', typ.IntegerType()),
            ('like', typ.IntegerType()),
            ('time_day', typ.IntegerType()),
            ('item_pub_month', typ.IntegerType()),
            ('item_pub_day', typ.LongType()),
            ('item_pub_hour', typ.IntegerType()),
            ('item_pub_minute', typ.IntegerType()),
            ('uid_count_bin', typ.IntegerType()),
            ('user_city_count_bin', typ.IntegerType()),
            ('user_city_count_ratio', typ.DoubleType()),
            ('item_id_count_bin', typ.IntegerType()),
            ('item_id_count_ratio', typ.DoubleType()),
            ('author_id_count_bin', typ.IntegerType()),
            ('author_id_count_ratio', typ.DoubleType()),
            ('item_city_count_bin', typ.IntegerType()),
            ('item_city_count_ratio', typ.DoubleType()),
            ('music_id_count_bin', typ.IntegerType()),
            ('music_id_count_ratio', typ.DoubleType()),
            ('device_count_bin', typ.IntegerType()),
            ('device_count_ratio', typ.DoubleType()),
            ('uid_author_id_count_bin', typ.IntegerType()),
            ('uid_author_id_count_ratio', typ.DoubleType()),
            ('uid_item_city_count_bin', typ.IntegerType()),
            ('uid_item_city_count_ratio', typ.DoubleType()),
            ('uid_channel_count_bin', typ.IntegerType()),
            ('uid_channel_count_ratio', typ.DoubleType()),
            ('uid_music_id_count_bin', typ.IntegerType()),
            ('uid_music_id_count_ratio', typ.DoubleType()),
            ('uid_device_count_bin', typ.IntegerType()),
            ('uid_device_count_ratio', typ.DoubleType()),
            ('author_id_channel_count_bin', typ.IntegerType()),
            ('author_id_channel_count_ratio', typ.DoubleType()),
            ('author_id_user_city_count_bin', typ.IntegerType()),
            ('author_id_user_city_count_ratio', typ.DoubleType()),
            ('author_id_item_city_count_bin', typ.IntegerType()),
            ('author_id_item_city_count_ratio', typ.DoubleType()),
            ('author_id_music_id_count_bin', typ.IntegerType()),
            ('author_id_music_id_count_ratio', typ.DoubleType()),
            ('uid_channel_device_count_bin',
             typ.IntegerType()),  # renamed to uid_channel_device
            ('uid_channel_device_count_ratio',
             typ.DoubleType()),  # renamed to uid_channel_device
            ('author_id_item_city_music_id_count_bin', typ.IntegerType()),
            ('author_id_item_city_music_id_count_ratio', typ.DoubleType()),
        ]

        actionLogSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])

        df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd,
                                                    actionLogSchema)
        df_actLog_test.show(1, truncate=False)
        # df_actLog_test.printSchema()

        print('start to read actLog_train_step2')
        train_file_path = rootPath + 'actLog_train_single_cross'
        actLog_train_rdd = self.sc.pickleFile(train_file_path)
        # print(actLog_train_rdd.take(5))
        df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd,
                                                     actionLogSchema)

        print("对duration_time和time_day 根据finish、like进行分组")

        def DurationLikeBin(x):
            if x <= 2:
                return 1
            elif 2 < x <= 12:
                return 2
            elif 12 < x <= 15:
                return 3
            elif 15 < x <= 22:
                return 4
            elif 22 < x <= 42:
                return 5
            else:
                return 6

        converDurationLikeBin = udf(lambda x: DurationLikeBin(x),
                                    typ.IntegerType())
        df_actLog_train = df_actLog_train.withColumn(
            "duration_time_bin_like",
            converDurationLikeBin(df_actLog_train.duration_time))
        df_actLog_test = df_actLog_test.withColumn(
            "duration_time_bin_like",
            converDurationLikeBin(df_actLog_test.duration_time))

        def DurationFinishBin(x):
            if x <= 2:
                return 1
            elif 2 < x <= 12:
                return 2
            elif 12 < x <= 26:
                return 3
            elif 26 < x <= 42:
                return 4
            else:
                return 5

        converDurationFinishBin = udf(lambda x: DurationFinishBin(x),
                                      typ.IntegerType())
        df_actLog_train = df_actLog_train.withColumn(
            "duration_time_bin_finish",
            converDurationFinishBin(df_actLog_train.duration_time))
        df_actLog_test = df_actLog_test.withColumn(
            "duration_time_bin_finish",
            converDurationFinishBin(df_actLog_test.duration_time))

        def TimeLikeBin(x):
            if x >= 822:
                return 1
            elif 810 <= x < 822:
                return 2
            elif 781 <= x < 810:
                return 3
            elif 748 <= x < 781:
                return 4
            elif 726 <= x < 748:
                return 5
            elif 646 <= x < 726:
                return 6
            else:
                return 7

        converTimeLikeBin = udf(lambda x: TimeLikeBin(x), typ.IntegerType())
        df_actLog_train = df_actLog_train.withColumn(
            "time_day_bin_like", converTimeLikeBin(df_actLog_train.time_day))
        df_actLog_test = df_actLog_test.withColumn(
            "time_day_bin_like", converTimeLikeBin(df_actLog_test.time_day))

        def TimeFinshBin(x):
            if x >= 795:
                return 1
            elif 792 <= x < 795:
                return 2
            elif 632 <= x < 792:
                return 3
            else:
                return 4

        converTimeFinshBinBin = udf(lambda x: TimeFinshBin(x),
                                    typ.IntegerType())
        df_actLog_train = df_actLog_train.withColumn(
            "time_day_bin_finish",
            converTimeFinshBinBin(df_actLog_train.time_day))
        df_actLog_test = df_actLog_test.withColumn(
            "time_day_bin_finish",
            converTimeFinshBinBin(df_actLog_test.time_day))

        # Drop the original columns
        print("Drop unnecessary columns")
        unuse_col = [
            'item_city', 'user_city', 'device', 'author_id', 'music_id',
            "duration_time", "time_day"
        ]  # 'uid' and 'item_id' must be kept; they are needed later when submitting results
        unuse_col = unuse_col + [
            'item_pub_month', 'item_pub_day', 'item_pub_minute'
        ]
        df_actLog_train = self.dropUnuseCols(df_actLog_train, unuse_col)
        df_actLog_test = self.dropUnuseCols(df_actLog_test, unuse_col)

        # df_actLog_train=df_actLog_train.drop("duration_time").drop("time_day")
        # df_actLog_test=df_actLog_test.drop("duration_time").drop("time_day")
        # df_actLog_train.show(1,truncate=False)
        # df_actLog_train.printSchema()

        print('start to read nlp_topic_feature2')
        nlp_file_path = rootPath + 'nlp_topic_feature2'
        nlp_topic_rdd = self.sc.pickleFile(nlp_file_path)
        # item_id|title_features |title_words_unique|title_length|title_features_1 |title_topic|
        df_nlp_topic = nlp_topic_rdd.toDF([
            'item_id', "title_features", 'title_words_unique', 'title_length',
            "title_features_1", 'title_topic'
        ])
        # Drop unused columns
        df_nlp_topic = df_nlp_topic.drop("title_features")
        df_nlp_topic = df_nlp_topic.drop("title_features_1")
        # df_nlp_topic.show(2)
        # df_nlp_topic.printSchema()

        print("start to read face_feature")
        face_file_path = rootPath + 'face_feature'
        face_rdd = self.sc.pickleFile(face_file_path)
        labels = [('item_id', typ.IntegerType()),
                  ('gender', typ.IntegerType()), ('beauty', typ.DoubleType()),
                  ('relative_position_0', typ.DoubleType()),
                  ('relative_position_1', typ.DoubleType()),
                  ('relative_position_2', typ.DoubleType()),
                  ('relative_position_3', typ.DoubleType())]
        faceSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])
        df_face = sqlContext.createDataFrame(face_rdd, faceSchema)
        # Limit the number of decimal places for all these columns
        df_face = df_face.withColumn('relative_position_0',
                                     fn.bround('relative_position_0', scale=3))
        df_face = df_face.withColumn('relative_position_1',
                                     fn.bround('relative_position_1', scale=3))
        df_face = df_face.withColumn('relative_position_2',
                                     fn.bround('relative_position_2', scale=3))
        df_face = df_face.withColumn('relative_position_3',
                                     fn.bround('relative_position_3', scale=3))

        # df_face=df_face.repartition(300)

        print('start to read uid_item_face_feature')
        face_trainfile_path = rootPath + 'df_uid_face_train'
        face_testfile_path = rootPath + 'df_uid_face_test'

        face_trainrdd = self.sc.pickleFile(face_trainfile_path)
        face_testrdd = self.sc.pickleFile(face_testfile_path)
        # The concat step joins on item_id and uid
        # labels also need to be adjusted
        labels = [('uid', typ.IntegerType()),
                  ('uid_max_beauty', typ.DoubleType()),
                  ('uid_avg_beauty', typ.DoubleType()),
                  ('uid_male_ratio', typ.DoubleType())]
        itemfaceSchema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])
        df_face_train = sqlContext.createDataFrame(face_trainrdd,
                                                   itemfaceSchema)
        df_face_test = sqlContext.createDataFrame(face_testrdd, itemfaceSchema)
        # number of records before dedup: 2761799
        # number of records after dedup: 32615
        df_face_train = df_face_train.dropDuplicates()
        df_face_test = df_face_test.dropDuplicates()

        df_face_train = df_face_train.withColumn(
            'uid_max_beauty', fn.bround('uid_max_beauty', scale=3))
        df_face_train = df_face_train.withColumn(
            'uid_avg_beauty', fn.bround('uid_avg_beauty', scale=3))
        df_face_train = df_face_train.withColumn(
            'uid_male_ratio', fn.bround('uid_male_ratio', scale=3))
        # df_face.show()
        # df_face_train.printSchema()

        print("三表进行关联")  #三个表的数据量不大,但是关联后数据量却比df_actLog_test增加近1000倍
        df_test=df_actLog_test.join(df_nlp_topic,'item_id','left')\
                      .join(df_face,'item_id','left')\
                      .join(df_face_test,"uid",'left')

        df_train=df_actLog_train.join(df_nlp_topic,'item_id','left')\
                       .join(df_face,'item_id','left')\
                      .join(df_face_train,"uid",'left')

        # df_train.show(1,truncate=False)
        print("查看表结构")
        print("schema,为下一步build_data读取数据做准备")
        df_train.printSchema()
        df_test.printSchema()

        print(
            "--------观察新增列uid_max_beauty,uid_avg_beauty,uid_male_ratio是否存在缺失值----"
        )

        # print('Missing ratio of each column in the training set')
        # music_id_count_bin_missing|music_id_count_ratio_missing  0.64   because these columns contain -1 values
        #
        # df_train.agg(*[(1-(fn.count(c) /fn.count('*'))).alias(c+'_missing') for c in df_train.columns]).show()
        #
        # print('Missing ratio of each column in the test set')
        #
        # df_test.agg(*[(1-(fn.count(c) /fn.count('*'))).alias(c+'_missing') for c in df_test.columns]).show()

        # After the three-way join some item_ids have no title, so the title-related columns may contain NaN values that need to be filled
        # Fill categorical variables with -1 and continuous variables with the mean
        # For 'title_topic', 'title_words_unique' and 'title_length', filling with -1 is enough

        # music_id_count_bin should already have been filled earlier, so this could be removed
        df_train=df_train.na.fill({'music_id_count_bin':-1,'music_id_count_ratio':0,\
                                   'title_words_unique':-1,'title_length':-1,'title_topic': -1})
        df_test=df_test.na.fill({'music_id_count_bin':-1,'music_id_count_ratio':0,\
                                 'title_words_unique':-1,'title_length':-1,'title_topic': -1})

        # Process the face features
        # Group by uid, i.e. take max_beauty and avg_beauty over all item_ids the user watched
        # Fill missing values for the continuous variables
        print('Output the means')
        mean_beauty = 0.53
        mean_relative_position0 = 0.392
        mean_relative_position1 = 0.228
        mean_relative_position2 = 0.212
        mean_relative_position3 = 0.164
        mean_max_beauty = 0.792
        mean_avg_beauty = 0.53
        # The commented-out block below computes the means; to reduce memory use and speed up training, the previously computed values are used directly

        # df=df_train.union(df_test)
        # mean_val = df.select(fn.mean(df['beauty'])).collect()
        # mean_beauty = round(mean_val[0][0],3) # to show the number
        # print(mean_beauty)
        # mean_val = df.select(fn.mean(df['relative_position_0'])).collect()
        # mean_relative_position0 = round(mean_val[0][0],3) # to show the number
        # print(mean_relative_position0)
        # mean_val = df.select(fn.mean(df['relative_position_1'])).collect()
        # mean_relative_position1 = round(mean_val[0][0] ,3)# to show the number
        # print(mean_relative_position1)
        # mean_val = df.select(fn.mean(df['relative_position_2'])).collect()
        # mean_relative_position2 = round(mean_val[0][0],3) # to show the number
        # print(mean_relative_position2)
        # mean_val = df.select(fn.mean(df['relative_position_3'])).collect()
        # mean_relative_position3 = round(mean_val[0][0],3) # to show the number
        # print(mean_relative_position3)
        # mean_val = df.select(fn.mean(df['uid_max_beauty'])).collect()
        # mean_max_beauty = round(mean_val[0][0],3) # to show the number
        # print(mean_max_beauty)
        # mean_val = df.select(fn.mean(df['uid_avg_beauty'])).collect()
        # mean_avg_beauty = round(mean_val[0][0],3) # to show the number
        # print(mean_avg_beauty)

        # del df
        gc.collect()

        df_train=df_train.na.fill({'gender': -1, 'beauty': mean_beauty,'relative_position_0': mean_relative_position0, \
                       'relative_position_1': mean_relative_position1,'relative_position_2': mean_relative_position2,\
                       'relative_position_3': mean_relative_position3 ,
                        'uid_max_beauty':mean_max_beauty, 'uid_avg_beauty':mean_avg_beauty, 'uid_male_ratio':0.5})
        df_test=df_test.na.fill({'gender': -1, 'beauty': mean_beauty,'relative_position_0': mean_relative_position0, \
                       'relative_position_1': mean_relative_position1,'relative_position_2': mean_relative_position2,\
                       'relative_position_3': mean_relative_position3 ,
                        'uid_max_beauty':mean_max_beauty, 'uid_avg_beauty':mean_avg_beauty, 'uid_male_ratio':0.5})

        #
        print('After filling missing values')
        # print('Missing ratio of each column in the training set')
        # df_train.agg(*[(1-(fn.count(c) /fn.count('*'))).alias(c+'_missing') for c in df_train.columns]).show()
        # print('Missing ratio of each column in the test set')
        # df_test.agg(*[(1-(fn.count(c) /fn.count('*'))).alias(c+'_missing') for c in df_test.columns]).show()
        '''
        print("三表关联后的数据保存在hdfs")
        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_concate_test'
        os.system("hadoop fs -rm -r {}".format(file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_test.rdd.map(tuple).saveAsPickleFile(file_path)
        print('File size:')
        os.system("hadoop fs -du -s -h  {}".format(file_path))

        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_concate_train'
        os.system("hadoop fs -rm -r {}".format(file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_train.rdd.map(tuple).saveAsPickleFile(file_path)
        print("hdfs保存结束")
        print('文件大小如下')
        os.system("hadoop fs -du -s -h  {}".format(file_path))
        '''

        # The code below can fail with: java.net.SocketException: Connection reset
        # and with:
        # Total size of serialized results of 162 tasks (1029.9 MB) is bigger than spark.driver.maxResultSize (1024.0 MB)
        print("Save the joined data locally")
        localPath = '/data/code/DeepCTR/data/dataForSkearn/'
        df_test.toPandas().to_csv(localPath + "test.csv", index=False)
        df_train.toPandas().to_csv(localPath + "train.csv", index=False)
        print("本地保存结束")
Example #19
# Databricks notebook source
from pyspark.sql.functions import expr, pow, col, round, bround, lit, corr
#create the DataFrame
df = spark.read.format("csv")\
  .option("header","true")\
  .option("inferSchema","true")\
  .load("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv")
df.createOrReplaceGlobalTempView("dfTable")

#pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("RealQuntity")).show(2)
#두개의 column이 모두 수치형 데이터 이므로 계산 가능

#rounding: round uses HALF_UP, bround uses HALF_EVEN (they differ only on exact .5 values)
df.select(round(lit("1.6")), bround(lit("1.6"))).show(2)
#correlation between the two columns
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show(2)

#describe (mean, standard deviation, min, max, count)
#the summary-statistics schema may change, so use it only for inspection
df.describe().show(6)

#statFunctions package
#accessed via the .stat attribute
ColName = "UnitPrice"
quantiledProbs = [0.5]
relError = 0.05
df.stat.approxQuantile("UnitPrice", quantiledProbs, relError)
Example #20
df_filtered = df_ordered.filter(df_ordered.age < 35).filter(df_ordered.age > 33)
df_filtered.show()

Group Data in Spark
In order to make more complex queries, we can also group data by different columns. 
This is done with the “groupBy” statement. 
Basically, this statement also takes just one argument – the column(s) to group by. 
The following sample is a bit more complex, but I will explain it after the sample:

from pyspark.sql.functions import bround
df_grouped = df_ordered \
    .groupBy(df_ordered.personid) \
    .sum("price") \
    .orderBy("sum(price)") \
    .select("personid", bround("sum(price)", 2)) \
    .withColumnRenamed("bround(sum(price), 2)", "value")
df_grouped.show()

So, what has happened here? We had several steps:

Take the previously ordered dataset and group it by personid
Create the sum of each person’s items
Order everything by the sum column. NOTE: the column is named “sum(price)” since it is a new column
We round the column “sum(price)” to two decimal places so that it looks nicer. Note again that the column name changes, this time to “bround(sum(price), 2)”
Since the column now has a name that is hard to interpret, we call the “withColumnRenamed” function to give it a much nicer name: we call our column “value”
A more compact alternative that avoids the rename step is sketched right after this list.
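A minimal sketch of that alternative (an assumption-based illustration, not part of the original tutorial), using the same df_ordered DataFrame with “personid” and “price” columns: agg with alias names the rounded sum directly, so the withColumnRenamed call is no longer needed.

from pyspark.sql.functions import bround, col, sum as sum_

df_grouped_alt = df_ordered \
    .groupBy("personid") \
    .agg(bround(sum_("price"), 2).alias("value")) \
    .orderBy(col("value"))
df_grouped_alt.show()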
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)


# COMMAND ----------

df.selectExpr(
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, round, bround

df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)


# COMMAND ----------

from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()


# COMMAND ----------

df.describe().show()


# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC As part of the analysis of this dataset, it would be useful to have the average rating for each movie. In the following cell of code, I aggregate over the ratings table to get the average rating for each movie ID. 

# COMMAND ----------

from pyspark.sql.types import FloatType
from pyspark.sql.functions import bround
from pyspark.sql.functions import mean

ratings_agg = ratings.groupBy("movieId").agg(mean("rating").alias("avg_rating"))
ratings_agg = ratings_agg.withColumn("average_rating", ratings_agg.avg_rating.cast(FloatType())).drop("avg_rating").withColumnRenamed("average_rating", "avg_rating")
ratings_agg = ratings_agg.select("movieId",bround("avg_rating",2).alias("avg_rating"))
ratings_agg.show()

# COMMAND ----------

# MAGIC %md 
# MAGIC Here we evaluate the average rating by year to identify whether there is a trend in the ratings, either decreasing or increasing over the years. Visually, it is not possible to appreciate such a trend, but it was possible to identify some outlier values in the year column.
# MAGIC 
# MAGIC To achieve this, it was necessary to join the aggregated ratings table with movies table that includes the year as a column. 

# COMMAND ----------

joined_movies = movies.join(ratings_agg,"movieId")
joined_movies.select("year",'avg_rating').groupBy("year").mean().orderBy("year").display()

# COMMAND ----------
    ),
    Row(
        Characteristic_Name="trends_sanitizer_Level_3",
        Point=credit_score_trends_sanitizer[2],
    ),
    Row(
        Characteristic_Name="trends_ir_avg_Level_1",
        Point=credit_score_air_avg[0],
    ),
    Row(
        Characteristic_Name="trends_ir_avg_Level_2",
        Point=credit_score_air_avg[1],
    ),
    Row(
        Characteristic_Name="trends_ir_avg_Level_3",
        Point=credit_score_air_avg[2],
    )
])

credit_score_table = credit_score_table.toDF()

credit_score_table = credit_score_table.select(
    "Characteristic_Name",
    F.bround("Point", scale=2).alias("Point"))

credit_score_table.show(30, truncate=False)
'''
credit_score_table.coalesce(1)\
                  .write\
                  .option("header", "true")\
                  .csv("C:/Users/ED/Documents/covid/generate_creditscorecard/credit_score_table.csv")'''
Beispiel #24
0
    def data_explore(self,df_train,df_test):

        sqlContext = SQLContext(self.sc)
        print("duration_time应该根据喜欢和不喜欢来分箱")
        print("查看duration_time的分布")
        print()
        print("------------1、通过时间戳获取年月日时分,(没有工作日特征,月日交叉表示节日特征,年份转化有问题)-----------------")


        # item publish time minus the earliest publish time, converted to days
        time_min = df_train.select(fn.min(df_train['time'])).collect()
        df_train=df_train.withColumn('time_day', ((df_train.time-fn.lit(time_min[0][0])) /fn.lit(3600 * 24)).cast(typ.IntegerType()))
        # df_train=df_train.withColumn('time_strDate',fn.from_unixtime(df_train.time , "yyyy-MM-dd HH:mm:ss"))
        # convert the unix-format timestamp to a date in the specified format and extract the hour
        df_train=df_train.withColumn('item_pub_month',fn.from_unixtime(df_train.time , "M").cast(typ.IntegerType()))
        df_train=df_train.withColumn('item_pub_day',fn.from_unixtime(df_train.time , "d").cast(typ.IntegerType()))
        df_train=df_train.withColumn('item_pub_hour',fn.from_unixtime(df_train.time , "k").cast(typ.IntegerType()))
        df_train=df_train.withColumn('item_pub_minute',fn.from_unixtime(df_train.time , "m").cast(typ.IntegerType()))
        print("查看month,day,hour,minute的提取是否正确")
        df_train.show(truncate=False)
        df_train=df_train.drop('time')
        # binning these time-derived fields by their counts is not obviously useful, so just treat them as categorical variables; in addition, add a pos_neg_ratio feature


        df_test=df_test.withColumn('time_day', ((df_test.time-fn.lit(time_min[0][0])) /fn.lit(3600 * 24)).cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_month',fn.from_unixtime(df_test.time , "M").cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_day',fn.from_unixtime(df_test.time , "d").cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_hour',fn.from_unixtime(df_test.time , "k").cast(typ.IntegerType()))
        df_test=df_test.withColumn('item_pub_minute',fn.from_unixtime(df_test.time , "m").cast(typ.IntegerType()))
        df_test=df_test.drop('time')

        print('--------2. Statistical features: count, ratio, nunique, and CTR-related features')
        print("Compute counts for base features and cross features, and category-preference ratios")
        count_feats_list = []

        print('single feature count')
        count_feats_list.extend([[c] for c in df_train.columns if c not in ['time', 'channel', 'like', 'finish','duration_time',"time_day","item_pub_month","item_pub_day","item_pub_hour","item_pub_minute"]])
        print(count_feats_list)

        print('cross count')
        users = ['uid']
        authors = ['item_id', 'user_city', 'author_id', 'item_city', 'channel', 'music_id', 'device','item_pub_hour']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        users = ['author_id']
        authors = ['channel', 'user_city', 'item_city', 'music_id',  'item_pub_hour']
        count_feats_list.extend([[u_col, a_col] for u_col in users for a_col in authors])

        count_feats_list.append(['uid', 'user_city', 'channel', 'device'])
        count_feats_list.append(['author_id', 'item_city', 'music_id','item_pub_hour'])
        print("计算count的字段有以下这些")
        print(count_feats_list)

        for i in range(len(count_feats_list)):
           group_cols=count_feats_list[i]
           new_feature = '_'.join(group_cols)
           # determine the dimensionality of the cross feature, concatenate the columns, compute the count of each feature value, and map it back
           if len(group_cols)==1:
              if new_feature in ["music_id"] :
                  df1 = df_train.where(df_train[new_feature]!=-1).groupby(new_feature).count()\
                          .withColumnRenamed('count',new_feature+'_count')
              else:
                  df1 = df_train.groupby(new_feature).count()\
                          .withColumnRenamed('count',new_feature+'_count')
              # category-preference ratio
              count_min = df1.select(fn.min(df1[new_feature+'_count'])).collect()[0][0]
              count_max = df1.select(fn.max(df1[new_feature+'_count'])).collect()[0][0]
              # F.bround("Rank", scale=4)
              df1=df1.withColumn(new_feature+'_count_ratio', fn.bround(((df1[new_feature+'_count']-fn.lit(count_min)) /((fn.lit(count_max)-fn.lit(count_min)).cast(typ.IntegerType()))),scale=3))
              # print("查看df1_1")
              # df1.show(5,truncate=False)
              if new_feature=="device":   #[1.0, 16.0, 46.0, 102.0, 204.0, 410.0, 10389.0] 修改
                 percent_list=[0,10,20,30,40,50,60,70,80,90,100]
              elif new_feature=="author_id":  #[1.0, 2.0, 7.0, 32.0, 78.0, 276186.0]
                  percent_list=[0,50,75,90,95,100]
              elif new_feature=="music_id":   #[1.0, 3.0, 13.0, 73.0, 211.0, 193640.0]
                 percent_list=[0,50,75,90,95,100]   #每个percent_list不相同
              elif new_feature=="uid":       #分箱[1.0, 104.0, 329.0, 741.0, 1131.0, 10389.0]
                  percent_list=[0,50,75,90,95,100]
              elif new_feature=="item_id":   #[1.0, 1.0, 2.0, 7.0, 14.0, 6911.0]  分箱修改
                  percent_list=[0,75,90,95,100]
              elif new_feature=="user_city":  #[1.0, 21935.5, 54519.5, 110179.0, 146319.75, 3789087.0] 修改
                  percent_list=[0,10,20,30,40,50,60,70,80,90,100]
              elif new_feature=="item_city":  #[1.0, 14725.0, 48576.0, 122887.0, 206845.5, 744265.0]  修改
                  percent_list=[0,10,20,30,40,50,60,70,80,90,100]
              else:
                  percent_list=[0,10,20,30,40,50,60,70,80,90,100]

              df1=self.bining(sqlContext,df1,new_feature+'_count',percent_list)
              # print(df1.show(5,truncate=False))
              df_train=df_train.join(df1,new_feature,'left')
              # print("train")
              # df_train.show(5,truncate=False)   # ratio is a continuous variable in the range 0-1
              df_test=df_test.join(df1,new_feature,'left')
              # print("test")
              # df_test.show(5,truncate=False)   # ratio is a continuous variable in the range 0-1
              del df1
              gc.collect()
           print("输出所有一维特征处理后的结果")
           df_train.show(1,truncate=False)
           df_train.printSchema()
           df_test.show(1,truncate=False)
           df_test.printSchema()

           if len(group_cols)==2:
              print("开始处理2维交叉变量")
              df_train=df_train.withColumn(new_feature, fn.concat_ws('_',df_train[group_cols[0]].cast(typ.StringType()),df_train[group_cols[1]].cast(typ.StringType()))
                                                             )
              df_test=df_test.withColumn(new_feature, fn.concat_ws('_',df_test[group_cols[0]].cast(typ.StringType()),df_test[group_cols[1]].cast(typ.StringType()))
                                                             )
              df2 = df_train.groupby(new_feature).count()\
                     .withColumnRenamed('count',new_feature+'_count')
              # category-preference ratio
              count_min = df2.select(fn.min(df2[new_feature+'_count'])).collect()[0][0]
              count_max = df2.select(fn.max(df2[new_feature+'_count'])).collect()[0][0]
              # F.bround("Rank", scale=4)
              df2=df2.withColumn(new_feature+'_count_ratio', fn.bround(((df2[new_feature+'_count']-fn.lit(count_min)) /((fn.lit(count_max)-fn.lit(count_min)).cast(typ.IntegerType()))),scale=3))
              # print("查看df1_1")
              # df2.show(5,truncate=False)
              if new_feature=="uid_item_id":
                 percent_list=[0,20,35,50,65,85,100]   # each percent_list is different
              else:
                 percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_user_city":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_author_id":
              #    percent_list=[0,50,75,90,95,100]   #每个percent_list不相同
              # elif new_feature=="uid_item_city":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_channel":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_music_id":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_device":
              #     percent_list=[0,50,75,90,95,100]
              # elif new_feature=="uid_time_pub_hour":
              #     percent_list=[0,50,75,90,95,100]

              # ['uid', 'item_id'], ['uid', 'user_city'], ['uid', 'author_id'], ['uid', 'item_city'], ['uid', 'channel'], ['uid', 'music_id'],
              #  ['uid', 'device'], ['uid', 'time_pub_hour']
              #['author_id', 'channel'], ['author_id', 'user_city'], ['author_id', 'item_city'], ['author_id', 'music_id'], ['author_id', 'time_pub_hour']

              df2=self.bining(sqlContext,df2,new_feature+'_count',percent_list)
              print("查看df2_2")
              df2.show(5,truncate=False)
              df_train=df_train.join(df2,new_feature,'left')
              # print("train")
              # df_train.show(5,truncate=False)   # ratio is a continuous variable in the range 0-1
              df_test=df_test.join(df2,new_feature,'left')
              # print("test")
              # df_test.show(5,truncate=False)


           if len(group_cols)==4:
              print("开始处理4维交叉变量")
              df_train=df_train.withColumn(new_feature, fn.concat_ws('_',df_train[group_cols[0]].cast(typ.StringType()),df_train[group_cols[1]].cast(typ.StringType()),
                                                             df_train[group_cols[2]].cast(typ.StringType()),df_train[group_cols[3]].cast(typ.StringType()))
                                                           )
              df_test=df_test.withColumn(new_feature, fn.concat_ws('_',df_test[group_cols[0]].cast(typ.StringType()),df_test[group_cols[1]].cast(typ.StringType()),
                                                             df_test[group_cols[2]].cast(typ.StringType()),df_test[group_cols[3]].cast(typ.StringType()))
                                                           )

              df3 = df_train.groupby(new_feature).count()\
                     .withColumnRenamed('count',new_feature+'_count')

              # category-preference ratio
              count_min = df3.select(fn.min(df3[new_feature+'_count'])).collect()[0][0]
              count_max = df3.select(fn.max(df3[new_feature+'_count'])).collect()[0][0]
              # F.bround("Rank", scale=4)
              df3=df3.withColumn(new_feature+'_count_ratio', fn.bround(((df3[new_feature+'_count']-fn.lit(count_min)) /((fn.lit(count_max)-fn.lit(count_min)).cast(typ.IntegerType()))),scale=3))
              # print("查看df3_1")
              # df3.show(5,truncate=False)
              percent_list=[0,50,75,90,95,100]
              df3=self.bining(sqlContext,df3,new_feature+'_count',percent_list)
              print("查看df3_2")
              df3.show(5,truncate=False)
              df_train=df_train.join(df3,new_feature,'left')
              # print("train")
              # df_train.show(5,truncate=False)
              # ['uid', 'user_city', 'channel', 'device'], ['author_id', 'item_city', 'music_id', 'time_pub_hour']
              df_test=df_test.join(df3,new_feature,'left')
              # print("test")
              # df_test.show(5,truncate=False)
        # df.show(5,truncate=False)
        print("删除没有必要的列")
        unuse_col=['item_city','user_city','device','author_id','music_id',]  #'uid','item_id'这两列不能删除,后面提交结果的时候应该要用到
        df_train=self.dropUnuseCols(df_train,unuse_col)
        df_test=self.dropUnuseCols(df_test,unuse_col)

        print("表中含有为null的字段,主要产生在leftjoin的时候")
        print("这一步先不做,三表联合的时候会填充")
        # df_train=df_train.na.fill(-1)
        # df_test=df_test.na.fill(-1)

        print("查看train的统计信息")
        desc = df_train.describe()
        desc.show()
        print("查看test的统计信息")
        desc = df_test.describe()
        desc.show()


        print('-------5. Save the data preprocessing results-------')
        test_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_test_new'
        os.system("hadoop fs -rm -r {}".format(test_file_path))
        df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)

        del df_test
        gc.collect()

        train_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_train_new'
        os.system("hadoop fs -rm -r {}".format(train_file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
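
    # Hedged sketch (not part of the original snippet): the self.bining and self.dropUnuseCols
    # helpers used above are not shown here. Minimal illustrative implementations, assuming that
    # bining buckets a count column by the given percentile positions and that dropUnuseCols
    # simply drops the listed columns, might look like this:
    def bining(self, sqlContext, df, col_name, percent_list):
        from pyspark.ml.feature import Bucketizer
        # Turn the percentile positions (0-100) into probabilities for approxQuantile.
        probs = [p / 100.0 for p in percent_list]
        splits = df.approxQuantile(col_name, probs, 0.01)
        # Deduplicate the split points and pad with +/- infinity so every value falls into a bucket.
        splits = sorted(set(splits))
        splits = [-float('inf')] + splits + [float('inf')]
        bucketizer = Bucketizer(splits=splits, inputCol=col_name, outputCol=col_name + '_bin')
        return bucketizer.transform(df.withColumn(col_name, df[col_name].cast(typ.DoubleType())))

    def dropUnuseCols(self, df, cols):
        # Drop each unused column if it is present.
        for c in cols:
            if c in df.columns:
                df = df.drop(c)
        return df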




        '''