Example #1
# Assumes trafico_medio is an existing DataFrame with coord_x, coord_y and IMD columns
from pyspark.sql.functions import abs as fabs, sum as fsum


def comparacion(cx, cy):
    # Define the search radius
    r = 0.01
    # Keep the traffic points within Manhattan distance r of (cx, cy)
    imds = trafico_medio.filter(
        fabs(trafico_medio["coord_x"] - cx) +
        fabs(trafico_medio["coord_y"] - cy) < r)

    # Return the arithmetic mean of the average traffic intensities (IMD)
    return imds.select(fsum("IMD")).collect()[0][0] / imds.count()
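# Side note: the same mean can be computed in a single aggregation with Spark's
# built-in avg, avoiding the separate count() action. A minimal sketch under the
# same assumption about trafico_medio (the name comparacion_avg is hypothetical):
from pyspark.sql.functions import avg


def comparacion_avg(cx, cy, r=0.01):
    # Mean IMD of the traffic points within Manhattan distance r of (cx, cy)
    return (trafico_medio
            .filter(fabs(trafico_medio["coord_x"] - cx) +
                    fabs(trafico_medio["coord_y"] - cy) < r)
            .agg(avg("IMD"))
            .first()[0])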
def get_total_traffic(cfg, hive_context):
    # Sum of 'imp' over the whole pre-cluster table
    command = """
        SELECT
        T1.imp,
        T1.si
        FROM {} AS T1
        """.format(cfg['pre_cluster_table'])

    df = hive_context.sql(command)
    result = df.agg(fsum('imp')).take(1)[0]['sum(imp)']
    return result
def get_si_traffic(cfg, hive_context, si):
    # Sum of 'imp' restricted to a single 'si' value
    command = """
        SELECT
        T1.imp,
        T1.si
        FROM {} AS T1
        WHERE T1.si='{}'
        """.format(cfg['pre_cluster_table'], si)

    df = hive_context.sql(command)
    result = df.agg(fsum('imp')).take(1)[0]['sum(imp)']
    return result
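# Hypothetical usage of the two helpers above; the cfg dict, the table name and
# the 'si' value are assumptions, and any Hive-enabled session (or HiveContext)
# whose .sql() can see the table will do as hive_context:
cfg = {'pre_cluster_table': 'analytics.pre_cluster'}
total = get_total_traffic(cfg, hive_context)            # sum of 'imp' over the whole table
si_total = get_si_traffic(cfg, hive_context, 'SI_01')   # sum of 'imp' for one 'si' value
print(si_total / total)                                  # share of total traffic for that 'si'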
Example #4
                                 col('date2').alias('date'), 'quantity',
                                 'amount'))


def subtract_ending_date(date):
    ending_date = datetime(2013, 7, 27, 0, 0)
    return (ending_date.date() - date).days


udf_subtract_ending_date = udf(subtract_ending_date, IntegerType())

ending_date = datetime(2013, 7, 27, 0, 0)
one_year_ago = ending_date - relativedelta(years=1)
rfm = (transactions.where(col('date') >= one_year_ago).groupBy('id').agg(
    fmax('date'),
    fsum('quantity').alias('frequency'),
    fmean('amount').alias('monetization')).withColumn(
        'recency', udf_subtract_ending_date(col('max(date)'))).select(
            'id', 'recency', 'frequency', 'monetization'))
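# Side note: the recency column can also be derived without a Python UDF by
# using Spark's built-in datediff; a sketch over the same transactions
# DataFrame (the name rfm_no_udf is hypothetical):
from pyspark.sql.functions import datediff, lit

rfm_no_udf = (transactions.where(col('date') >= one_year_ago).groupBy('id').agg(
    fmax('date').alias('last_date'),
    fsum('quantity').alias('frequency'),
    fmean('amount').alias('monetization')).withColumn(
        'recency', datediff(lit('2013-07-27'), col('last_date'))).select(
            'id', 'recency', 'frequency', 'monetization'))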
vect_rfm = VectorAssembler(inputCols=['recency'],
                           outputCol='recency_v').transform(rfm)
vect_rfm = VectorAssembler(inputCols=['frequency'],
                           outputCol='frequency_v').transform(vect_rfm)
vect_rfm = VectorAssembler(inputCols=['monetization'],
                           outputCol='monetization_v').transform(vect_rfm)
scaled_rfm = MinMaxScaler(
    inputCol='recency_v',
    outputCol='recency_s').fit(vect_rfm).transform(vect_rfm)
scaled_rfm = MinMaxScaler(
    inputCol='frequency_v',
    outputCol='frequency_s').fit(scaled_rfm).transform(scaled_rfm)
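# The fragment breaks off before the monetization feature is scaled; assuming
# it follows the same pattern as recency and frequency, the remaining step
# would be:
scaled_rfm = MinMaxScaler(
    inputCol='monetization_v',
    outputCol='monetization_s').fit(scaled_rfm).transform(scaled_rfm)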
Example #5
            .withColumn('partition_id', spark_partition_id()) \
            .withColumn('idx', dense_rank().over(w1))

    # get the number of unique rows (based on Address+LNAME) per partition, which is max(idx),
    # and then take the running SUM of this cnt -> rcnt
    # partition_id: spark partition id
    # idx: in-partition id calculated with dense_rank
    # cnt: number of unique ids in the same partition, fmax('idx')
    # rcnt: starting id for a partition (a running count): coalesce(fsum('cnt').over(w2), lit(0))
    # w2: WindowSpec used to calculate the above rcnt
    w2 = Window.partitionBy().orderBy('partition_id').rowsBetween(
        Window.unboundedPreceding, -1)

    df2 = df1.groupby('partition_id') \
             .agg(fmax('idx').alias('cnt')) \
             .withColumn('rcnt', coalesce(fsum('cnt').over(w2), lit(0)))

    df2.show()
    #+------------+---+----+
    #|partition_id|cnt|rcnt|
    #+------------+---+----+
    #|           0|  2|   0|
    #|           1|  1|   2|
    #|           2|  2|   3|
    #|           4|  1|   5|
    #+------------+---+----+
    """join df1 with df2 and create id = idx + rcnt"""
    df_new = df1.join(broadcast(df2),
                      on=['partition_id']).withColumn('id',
                                                      col('idx') + col('rcnt'))
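# For context, a self-contained sketch of the same consecutive-id pattern; the
# toy DataFrame, the Address/LNAME ordering and the w1 window spec are
# assumptions, since they fall outside the fragment:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (broadcast, coalesce, col, dense_rank, lit,
                                   max as fmax, spark_partition_id, sum as fsum)

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('1 Main St', 'Smith'), ('1 Main St', 'Smith'), ('2 Oak Ave', 'Jones')],
    ['Address', 'LNAME'])

# in-partition id: dense_rank over Address+LNAME inside each Spark partition
w1 = Window.partitionBy('partition_id').orderBy('Address', 'LNAME')
df1 = (df.withColumn('partition_id', spark_partition_id())
         .withColumn('idx', dense_rank().over(w1)))

# running sum of per-partition unique counts -> starting offset of each partition
w2 = (Window.partitionBy().orderBy('partition_id')
            .rowsBetween(Window.unboundedPreceding, -1))
df2 = (df1.groupby('partition_id')
          .agg(fmax('idx').alias('cnt'))
          .withColumn('rcnt', coalesce(fsum('cnt').over(w2), lit(0))))

# final id = in-partition rank + running offset of the preceding partitions
df_new = (df1.join(broadcast(df2), on=['partition_id'])
             .withColumn('id', col('idx') + col('rcnt')))
df_new.show()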
Example #6
             'FLIGHTS', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
             'LATE_AIRCRAFT_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON',
             'TAXI_IN', 'CANCELLED', 'AIR_TIME', 'DISTANCE', 'ARR_DELAY_NEW',
             'DEP_DELAY_NEW')

# In[5]:

df.show(4)
#df.write.csv('file:///Users/lavidhaliwal/mycsv.csv')

# In[6]:

#df.summary()
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as fsum
# Per-column count of null values, plus the total row count
y = df.select(*(fsum(col(c).isNull().cast("int")).alias(c)
                for c in df.columns))
y1 = df.count()

# In[7]:

z = y.collect()

# In[8]:

# Percentage of rows with a null DEP_DELAY
dep_d = z[0].DEP_DELAY
print(dep_d * 100 / y1)
print(y1)
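# The same calculation extends to every selected column at once, reusing the
# collected null counts in z and the row count y1 (null_pct is a hypothetical name):
null_pct = {c: z[0][c] * 100 / y1 for c in df.columns}
print(null_pct)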

# In[9]: