from pyspark.sql.functions import abs as fabs, sum as fsum  # assumed aliases


def comparacion(cx, cy):
    # Define the radius
    r = 0.01
    # Take the traffic points within a radius r of (cx, cy) (Manhattan distance)
    imds = trafico_medio.filter(
        fabs(trafico_medio["coord_x"] - cx) +
        fabs(trafico_medio["coord_y"] - cy) < r)
    # Return the arithmetic mean of the average traffic intensities (IMD)
    return imds.select(fsum("IMD")).collect()[0][0] / imds.count()
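# Sketch of a possible call (not from the original code): the coordinates below are
# purely illustrative, and `trafico_medio` is assumed to be an already-loaded
# DataFrame with coord_x, coord_y and IMD columns.
imd_cercana = comparacion(-3.70, 40.42)
print(imd_cercana)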
from pyspark.sql.functions import sum as fsum  # assumed alias


def get_total_traffic(hive_context):
    command = """
        SELECT T1.imp, T1.si
        FROM {} AS T1
    """.format(cfg['pre_cluster_table'])
    df = hive_context.sql(command)
    result = df.agg(fsum('imp')).take(1)[0]['sum(imp)']
    return result
def get_si_traffic(cfg, hive_context, si):
    command = """
        SELECT T1.imp, T1.si
        FROM {} AS T1
        WHERE T1.si='{}'
    """.format(cfg['pre_cluster_table'], si)
    df = hive_context.sql(command)
    result = df.agg(fsum('imp')).take(1)[0]['sum(imp)']
    return result
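# Sketch (assumed usage, not part of the original module): combine the two helpers
# above to get the fraction of total impressions that belong to one si value.
def get_si_traffic_share(cfg, hive_context, si):
    total = get_total_traffic(hive_context)
    return get_si_traffic(cfg, hive_context, si) / total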
    col('date2').alias('date'), 'quantity', 'amount'))


def subtract_ending_date(date):
    ending_date = datetime(2013, 7, 27, 0, 0)
    return (ending_date.date() - date).days


udf_subtract_ending_date = udf(subtract_ending_date, IntegerType())

ending_date = datetime(2013, 7, 27, 0, 0)
one_year_ago = ending_date - relativedelta(years=1)

# RFM metrics per customer over the last year: recency (days since last purchase),
# frequency (total quantity) and monetization (mean amount).
rfm = (transactions.where(col('date') >= one_year_ago).groupBy('id').agg(
    fmax('date'),
    fsum('quantity').alias('frequency'),
    fmean('amount').alias('monetization')).withColumn(
        'recency', udf_subtract_ending_date(col('max(date)'))).select(
            'id', 'recency', 'frequency', 'monetization'))

# Assemble each metric into a single-element vector so it can be scaled.
vect_rfm = VectorAssembler(inputCols=['recency'],
                           outputCol='recency_v').transform(rfm)
vect_rfm = VectorAssembler(inputCols=['frequency'],
                           outputCol='frequency_v').transform(vect_rfm)
vect_rfm = VectorAssembler(inputCols=['monetization'],
                           outputCol='monetization_v').transform(vect_rfm)

# Scale each metric to the [0, 1] range.
scaled_rfm = MinMaxScaler(
    inputCol='recency_v',
    outputCol='recency_s').fit(vect_rfm).transform(vect_rfm)
scaled_rfm = MinMaxScaler(
    inputCol='frequency_v',
    outputCol='frequency_s').fit(scaled_rfm).transform(scaled_rfm)
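# The excerpt stops after scaling recency and frequency; assuming the pipeline
# continues symmetrically (a sketch, not from the original), monetization would be
# scaled the same way before the three *_s columns are used downstream.
scaled_rfm = MinMaxScaler(
    inputCol='monetization_v',
    outputCol='monetization_s').fit(scaled_rfm).transform(scaled_rfm)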
    .withColumn('partition_id', spark_partition_id()) \
    .withColumn('idx', dense_rank().over(w1))

# get number of unique rows (based on Address+LNAME) which is max_idx
# and then grab the running SUM of this cnt -> rcnt
#   partition_id: spark partition id
#   idx:  in-partition id calculated above
#   cnt:  number of unique ids in the same partition: fmax('idx')
#   rcnt: starting id for a partition (something like a running count):
#         coalesce(fsum('cnt').over(w2), lit(0))
#   w2:   WindowSpec used to calculate the above rcnt
w2 = Window.partitionBy().orderBy('partition_id').rowsBetween(
    Window.unboundedPreceding, -1)

df2 = df1.groupby('partition_id') \
    .agg(fmax('idx').alias('cnt')) \
    .withColumn('rcnt', coalesce(fsum('cnt').over(w2), lit(0)))

df2.show()
#+------------+---+----+
#|partition_id|cnt|rcnt|
#+------------+---+----+
#|           0|  2|   0|
#|           1|  1|   2|
#|           2|  2|   3|
#|           4|  1|   5|
#+------------+---+----+

# join df1 with df2 and create id = idx + rcnt
df_new = df1.join(broadcast(df2), on=['partition_id']) \
    .withColumn('id', col('idx') + col('rcnt'))
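# Sketch of a quick check (assumed, not in the original): after the join,
# id = idx + rcnt should be unique and consecutive across all partitions.
df_new.select('partition_id', 'idx', 'rcnt', 'id').orderBy('id').show()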
             'FLIGHTS', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
             'LATE_AIRCRAFT_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON',
             'TAXI_IN', 'CANCELLED', 'AIR_TIME', 'DISTANCE', 'ARR_DELAY_NEW',
             'DEP_DELAY_NEW')

# In[5]:

df.show(4)
# df.write.csv('file:///Users/lavidhaliwal/mycsv.csv')

# In[6]:

# df.summary()
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as fsum

# Count the null values in each column, and the total number of rows.
y = df.select(*(fsum(col(c).isNull().cast("int")).alias(c) for c in df.columns))
y1 = df.count()

# In[7]:

z = y.collect()

# In[8]:

# Percentage of rows with a null DEP_DELAY value.
dep_d = z[0].DEP_DELAY
print(dep_d * 100 / y1)
print(y1)

# In[9]:
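# Sketch (assumed, not part of the original notebook): report the null percentage for
# every column, reusing the collected null counts z and the row count y1 from above.
for c in df.columns:
    print('{}: {:.2f}% null'.format(c, z[0][c] * 100 / y1))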