Example #1
from pyspark.sql import functions
from pyspark.sql.functions import round


def runDateFunctions(spark):
    # In Python, a DataFrame can also be created from a list of tuples, as shown here
    df1 = spark.createDataFrame([(1.512,), (2.234,), (3.42,)], ['value'])
    df2 = spark.createDataFrame([(25.0,), (9.0,), (10.0,)], ['value'])

    df1.select(round(df1["value"], 1)).show()
    df2.select(functions.sqrt('value')).show()
Example #2
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
            (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))

    # Test subset
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))

    # Sum the hourly counts over each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later use
    sums.cache()

    # define a window := the previous day
    window_spec =  Window.partitionBy(sums.projectcode, sums.page) \
            .orderBy(sums.day.asc()).rowsBetween(-1, -1)

    # compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views) \
            .over(window_spec))

    # compute the coefficient applied to each day
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)

    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')

    # compute each day's score
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)

    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # normalize by the square root of the total views
    totals = totals.withColumn('score',
            totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
            .orderBy(F.desc('score')) \
            .withColumnRenamed('SUM(views)', 'total_views') \
            .limit(10)

    views = sums.select('projectcode', 'page', 'day', 'views') \
           .join(totals.select('projectcode', 'page', 'total_views', 'score'), 
                  (totals.projectcode == sums.projectcode) & (totals.page == sums.page), 'right_outer')

    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'], index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    
    # clear the cache
    hc.clearCache()
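
A reading of the trending score computed above (my summary, not text from the original source): for each (projectcode, page), score = sum over d in [day-30, day-1] of (views(d) - views(d-1)) / (day - d), divided by sqrt(total views over the same window); the ten highest-scoring pages are then written to the day's CSV together with their daily view counts.
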
def geodistance(df, target_name, lng1, lat1, lng2, lat2):
  # great-circle (haversine) distance in feet: Earth radius 3963 miles * 5280 ft/mile
  result = df.withColumn("dlon", radians(col(lng1)) - radians(col(lng2))) \
    .withColumn("dlat", radians(col(lat1)) - radians(col(lat2))) \
    .withColumn(target_name, asin(sqrt(sin(col("dlat") / 2) ** 2 + cos(radians(col(lat2)))* cos(radians(col(lat1))) * sin(col("dlon") / 2) ** 2)) * 2 * 3963 * 5280) \
    .drop("dlon", "dlat")
  return result
Example #4
def processing_loop(spark_master, input_queue, output_queue, wikieod_file):
    """Create a model and process requests for new predictions.

    This function is the heart of the application. It accepts a URL to
    a Spark master, multiprocessing input and output Queue objects, and
    the location of the end of day stock data in parquet format. With
    this information it will load the end of day data and create a
    model to base future predictions upon.

    After creating the model, it will enter a blocking loop waiting for
    new prediction requests to arrive. After receiving a new request,
    it will simulate the requested stock predictions and place the
    results into the output queue.

    It is important to note that this function will run as a separate
    process started by the main function. This is done to isolate the
    Spark processing components from the thread of execution that is
    running the Flask web server. In this manner the application will
    be reactive to incoming input without blocking on the processing
    activity.
    """
    # import these here to allow the debug mode to function properly in the
    # absence of spark
    import pyspark
    from pyspark import sql as pysql
    from pyspark.sql import functions as pyfuncs

    spark = pysql.SparkSession.builder.master(spark_master).getOrCreate()
    sc = spark.sparkContext

    output_queue.put('ready')

    df = spark.read.load(wikieod_file)
    ddf = df.select('ticker', 'date', 'close').withColumn(
        'change', (pyfuncs.col('close') / pyfuncs.lag('close', 1).over(
            pysql.Window.partitionBy('ticker').orderBy(df['date'])) - 1.0) *
        100)

    mv = ddf.groupBy('ticker').agg(
        pyfuncs.avg('change').alias('mean'),
        pyfuncs.sqrt(pyfuncs.variance('change')).alias('stddev'))

    dist_map = mv.rdd.map(lambda r: (r[0], (r[1], r[2]))).collectAsMap()

    priceDF = ddf.orderBy('date', ascending=False).groupBy('ticker').agg(
        pyfuncs.first('close').alias('price'),
        pyfuncs.first('date').alias('date'))
    prices = priceDF.rdd.map(lambda r: (r[0], r[1])).collectAsMap()

    while True:
        req = input_queue.get()
        portfolio = {}
        for stock in req['stocks']:
            portfolio[stock['symbol']] = (prices[stock['symbol']] *
                                          stock['quantity'])

        seed_rdd = sc.parallelize(seeds(10000))
        bparams = sc.broadcast(dist_map)
        bpf = sc.broadcast(portfolio)
        initial_value = portfolio_value(portfolio)
        results = seed_rdd.map(lambda s: portfolio_value(
            simulate(s, bpf.value, bparams.value, req['days'])) - initial_value
                               )
        simulated_results = list(zip(results.collect(), seed_rdd.collect()))
        simulated_values = [v for (v, _) in simulated_results]
        simulated_values.sort()
        num_samples = req['simulations'] if req['simulations'] < 100 else 100
        prediction = [
            simulated_values[int(len(simulated_values) * i / num_samples)]
            for i in range(num_samples)
        ]
        percentage_var = 0.05
        fivepercent = '{:0.2f}'.format(simulated_values[int(
            len(simulated_values) * percentage_var)])
        req.update({
            'status': 'ready',
            'fivepercent': fivepercent,
            'prediction': prediction
        })
        output_queue.put(req)
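
As the docstring above describes, processing_loop is meant to run as its own process and talk to the web tier through multiprocessing queues. A minimal launch sketch, assuming the function is importable; the master URL, parquet path and ticker below are placeholders, not values from the original project:

from multiprocessing import Process, Queue

input_queue, output_queue = Queue(), Queue()
worker = Process(target=processing_loop,
                 args=('local[*]', input_queue, output_queue,
                       '/tmp/wikieod.parquet'))  # placeholder master URL and path
worker.start()

output_queue.get()   # blocks until the worker reports 'ready'
input_queue.put({'stocks': [{'symbol': 'AAPL', 'quantity': 10}],  # hypothetical request
                 'days': 5, 'simulations': 100})
print(output_queue.get())  # the request echoed back with 'prediction' and 'fivepercent'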
Example #5
def compile_sqrt(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.sqrt(src_column)
Example #6
def select_acc_intervals(df, ts_name, interval, window, incl_vect=False, incl_acc=False):
    """
    Filter DataFrame with a new epoch duration.

    :param df:
        Spark DataFrame object with timestamp data
    :param ts_name:
        column with timestamp data
    :param interval:
        initial epoch duration (in seconds)
    :param window:
        new epoch duration (in seconds)
    :param incl_vect:
        if true, calculate vector magnitude and include it in the DataFrame
    :param incl_acc:
        if true, all raw accelerometer data are included in the DataFrame
    :return:
        Spark DataFrame object with timestamp data
    """

    ## the window must be at least as large as a single epoch
    assert interval <= 60, "Epoch larger than 1 minute."
    assert window >= interval, "Window smaller than epoch."

    cols = df.columns
    selected_cols = ['axis1', 'axis2', 'axis3', 'steps']  # TODO: add eeAccumulator

    minp = df.select(F.min(ts_name).cast('long')).first()[0]

    if interval < window:

        df2 = df.withColumn('tmp', F.row_number().over(Window.orderBy(ts_name)) - 1)

        df2 = df2.withColumn('total_sec', F.col(ts_name).cast('long')).cache()
        df2 = df2.checkpoint()
        df2.count()

        for col in selected_cols:

            df2 = df2.withColumn(col, F.when(((F.col('total_sec') - minp) % window == 0),
                                             F.sum(col).over(Window.orderBy('total_sec')
                                                             .rangeBetween(0, window - interval)
                                                             )
                                             ).otherwise(0)
                                 )

        df2 = df2.withColumn('duration', F.col(ts_name).cast(IntegerType()) -
                             F.lag(F.col(ts_name).cast(IntegerType()), 1, minp)
                             .over(Window.orderBy(ts_name))
                             ).drop('total_sec')

        df2 = df2.withColumn('tmp', (F.col('tmp') * F.col('duration')) % window).drop('duration').orderBy(ts_name)

        df2 = df2.filter(F.col('tmp') == 0).drop('tmp').orderBy(ts_name)

    else:

        df2 = df

    if incl_vect:

        df2 = df2.withColumn('vectMag', F.sqrt(F.col('axis1') ** 2 + F.col('axis2') ** 2 + F.col('axis3') ** 2))

        cols.insert(1, 'vectMag')

        df2 = df2.select(cols).orderBy(ts_name)

    if not incl_acc:

        df2 = df2.select(ts_name, cols[1])

    return df2
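
A hypothetical call of select_acc_intervals, assuming a DataFrame df with a 'timestamp' column and the 'axis1', 'axis2', 'axis3' and 'steps' columns referenced above, recorded in 10-second epochs and resampled to 60-second windows:

df_resampled = select_acc_intervals(df, ts_name='timestamp', interval=10, window=60,
                                    incl_vect=True, incl_acc=False)
df_resampled.show(5)
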
    # explodes all array columns
    d1 = d.withColumn("new", F.arrays_zip("heart_rate","timestamp","latitude","longitude","lat2","long2","time2"))\
            .withColumn("new", F.explode("new"))\
            .select("userId","id",
                    F.col("new.heart_rate").alias("heart_rate"),
                    F.col("new.timestamp").alias("timestamp"),
                    F.col("new.latitude").alias("lat"),
                    F.col("new.longitude").alias("long"),
                    F.col("new.lat2").alias("lat2"),
                    F.col("new.long2").alias("long2"),
                    F.col("new.time2").alias("time2"))

    # haversine formula, calculates distance and speed between two points
    d2 = d1.withColumn("distance", 3956 *(2 * F.asin(F.sqrt(F.sin((F.radians("lat") - F.radians("lat2"))/2)**2
                                    + F.cos(F.radians("lat")) * F.cos(F.radians("lat2")) * F.sin((F.radians("long")
                                    - F.radians("long2"))/2)**2))))\
           .withColumn("speed", F.col("distance")/((F.col("timestamp") - F.col("time2"))/3600))

    d2 = d2.fillna({"speed": 0})  # numeric fill so the double column is actually filled

    # aggregations that compute metrics related to an individual bike trip
    query = d2.groupBy("id", "userid").agg(
        F.round(F.mean("speed"), 2).alias("avgspeed"),
        F.round(F.max("speed"), 2).alias("max_speed"),
        F.round(F.mean("heart_rate")).cast("integer").alias("avg_heart_rate"),
        F.max("heart_rate").alias("max_heart_rate"),
        F.round(F.sum("distance"), 2).alias("distance"),
        conv_sec_udf(F.last("timestamp") -
                     F.first("timestamp")).alias("duration"),
        conv_sec_udf((F.last("timestamp") - F.first("timestamp")) /
Example #8



f_section={"U_sequence_square":'sum', "U_sequence":'sum',"Interval_int":'count',"avg(PRICE)":'mean'}

df_section=df8.groupby(["SYMBOL","DATE"]).agg(f_section)

df_section=df_section.withColumnRenamed('avg(avg(PRICE))', 'daily_average_price')
df_section=df_section.withColumnRenamed('count(Interval_int)', 'n')
df_section=df_section.withColumnRenamed('sum(U_sequence)', 'U_sum')
df_section=df_section.withColumnRenamed('sum(U_sequence_square)', 'U_squre_sum')



df_section=df_section.withColumn("Section_volatility",sqrt(col('U_squre_sum')/col('n')) - col('U_sum')*col('U_sum')/(col('n')*(col('n')-1)))

df_section=df_section.withColumnRenamed('U_squre_sum', 'Realized_volatility')


import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab



company_list=df_section.select(df_section.SYMBOL).distinct().collect()

company_list_panda=pd.DataFrame(company_list)
company_plot=dict()
date_plot=dict()
Example #9
def distance(CLat, CLon, data, col_name):
    # haversine distance (in metres) of every row from the fixed point (CLat, CLon)
    return data.withColumn('CLon', f.lit(CLon)) \
        .withColumn('CLat', f.lit(CLat)) \
        .withColumn("dlon", f.radians(f.col("CLon")) - f.radians(f.col("longitude"))) \
        .withColumn("dlat", f.radians(f.col("CLat")) - f.radians(f.col("latitude"))) \
        .withColumn(col_name,
                    f.asin(f.sqrt(
                        f.sin(f.col("dlat") / 2) ** 2 +
                        f.cos(f.radians(f.col("latitude"))) * f.cos(f.radians(f.col("CLat"))) *
                        f.sin(f.col("dlon") / 2) ** 2
                    )) * 2 * 6371 * 1000) \
        .drop("dlon", "dlat", 'CLon', 'CLat')
Example #10
df_join = df_join.withColumn(
    'longitude_distance',
    functions.radians(over_station_coord['near_longitude']) -
    functions.radians(short_station_coord['start_longitude']))

df_join = df_join.withColumn(
    'a',
    (pow(functions.sin('latitude_distance'), 2) +
     functions.cos(functions.radians(short_station_coord['start_latitude'])) *
     functions.cos(functions.radians(over_station_coord['near_latitude'])) *
     (pow(functions.sin('longitude_distance'), 2))))

df_join = df_join.withColumn(
    'distance',
    6373 * 2 * functions.atan2(sqrt(df_join['a']), sqrt(1 - df_join['a'])))

# distance less than 3 km
#df_join = df_join.filter(df_join['distance'] < 3)

df_join = df_join.select('date', 'hour', 'start_station_name',
                         'near_station_name', 'distance')

df_join = df_join.dropDuplicates(
    ['date', 'hour', 'start_station_name', 'near_station_name'])

df_join = df_join.orderBy('date', 'hour',
                          'distance').select('date', 'hour',
                                             'start_station_name',
                                             'near_station_name')
def ratingBasedMetrics(ratings):
    ratings_quad = ratings.select(
        "user_id", "business_id", "stars").withColumn(
            "stars_quad",
            col("stars") * col("stars")).alias("user_business_rating")
    sum_stars = ratings_quad.groupBy("user_id").agg(
        sum_sql("stars_quad").alias("sum_quad_stars"),
        count(lit(1)).alias("nr")
    ) \
        .alias("user_business_stars_quad")

    ratings_sum = ratings_quad.join(sum_stars,
                                    "user_id").select("business_id", "user_id",
                                                      "stars", "stars_quad",
                                                      "sum_quad_stars", "nr")

    all_pairs = ratings_sum.join(
        ren(ratings_sum, ["business_id"]),
        "business_id").filter(col("user_id") < col("user_id_2"))

    cosine_data = all_pairs.groupBy(
        "user_id", "user_id_2", "sum_quad_stars", "sum_quad_stars_2").agg(
            sum_sql("stars").alias("sum_stars"),
            sum_sql("stars_2").alias("sum_stars_2"),
            sum_sql(col("stars") * col("stars_2")).alias("sum_xy"),
            sum_sql((col("stars") - col("stars_2")) *
                    (col("stars") - col("stars_2"))).alias("sumxy_diff_quad"))
    cosine_rating = cosine_data.withColumn(
        "cosine_rating",
        ((col("sum_xy")) / (sqrt("sum_quad_stars") * sqrt("sum_quad_stars_2"))
         ).cast("float")).select(
             "user_id", "user_id_2",
             "cosine_rating").filter(col("cosine_rating") > 0)

    item_count = ratings.select("business_id").distinct().count()
    item_count_sqrt = math.sqrt(item_count)

    dfDiff = all_pairs.withColumn("diff", (col("stars") - col("stars_2")) *
                                  (col("stars") - col("stars_2")) -
                                  col("stars_quad") - col("stars_quad_2"))

    euclidean = dfDiff.groupBy(
        "user_id", "user_id_2", "sum_quad_stars",
        "sum_quad_stars_2").agg(sum_sql("diff").alias("sum_diff")).withColumn(
            "diff_quad",
            col("sum_diff") + col("sum_quad_stars") + col("sum_quad_stars_2"))

    euclidean_rating = euclidean.withColumn(
        "euclidean_rating",
        (1 / (1 + sqrt("diff_quad") / item_count_sqrt)).cast("float")).select(
            "user_id", "user_id_2",
            "euclidean_rating").filter(col("euclidean_rating") > 0)

    intersection = all_pairs.groupBy("user_id", "user_id_2", "nr", "nr_2").agg(
        count(lit(1)).alias("intersection"))
    jaccard_rating = intersection.withColumn("jaccard_rating", (
        col("intersection") /
        (col("nr") + col("nr_2") - col("intersection"))).cast("float")).select(
            "user_id", "user_id_2",
            "jaccard_rating").filter(col("jaccard_rating") > 0)

    mean_ratings = ratings_quad.groupBy("user_id").agg(
        mean("stars").alias("mean_stars")).alias("mean_ratings")

    centered_stars = ratings_quad.join(mean_ratings, "user_id").withColumn(
        "centered_stars",
        col("stars") - col("mean_stars")).withColumn(
            "centered_quad_stars",
            col("centered_stars") * col("centered_stars"))

    centered_stars_sums = centered_stars.groupBy("user_id").agg(sum_sql("centered_stars").alias("sum_centered_stars"),
                                                                sum_sql("centered_quad_stars").alias(
                                                                    "sum_centered_quad_stars")) \
        .alias("centered_stars_sums")

    centered_stars = centered_stars.join(centered_stars_sums, "user_id")
    centered_stars = centered_stars.join(
        ren(centered_stars, ["business_id"]),
        "business_id").filter(col("user_id") < col("user_id_2"))

    centered_grouped = centered_stars.groupBy("user_id", "user_id_2", "sum_centered_quad_stars",
                                              "sum_centered_quad_stars_2").agg(
        sum_sql(col("centered_stars") * col("centered_stars_2")).alias("sum_xy_centered")) \
        .alias("centered_sum_quad")

    pearson_rating = centered_grouped.withColumn(
        "pearson_rating",
        ((col("sum_xy_centered")) /
         (sqrt("sum_centered_quad_stars") *
          sqrt("sum_centered_quad_stars_2"))).cast("float")).select(
              "user_id", "user_id_2",
              "pearson_rating").filter(col("pearson_rating") > 0)

    return cosine_rating.join(jaccard_rating,
                              ["user_id", "user_id_2"], "outer").join(
                                  euclidean_rating, ["user_id", "user_id_2"],
                                  "outer").join(pearson_rating,
                                                ["user_id", "user_id_2"],
                                                "outer")
    def linearReg(self, dataset_add, feature_colm, label_colm, relation_list,
                  relation, userId, locationAddress):
        try:
            dataset = spark.read.parquet(dataset_add)
            dataset.show()

            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                categoryColm = value
                listValue = value
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(
                            inputCol=label,
                            outputCol='indexed_' + label,
                            handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            # encodedFeatures = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' + colm,
                                        handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            '''from pyspark.ml.feature import OneHotEncoderEstimator
                oneHotEncodedFeaturesList = []
                for colm in stringFeatures:
                        indexer = StringIndexer(inputCol=colm, outputCol='indexed_' + colm, handleInvalid="skip").fit(dataset)
                        indexed_features.append('indexed_' + colm)
                        dataset = indexer.transform(dataset)
                        oneHotEncodedFeaturesList.append('OneHotEncoded_' + colm)
                oneHotEncoder=OneHotEncoderEstimator(inputCols=indexed_features,
                                                     outputCols=oneHotEncodedFeaturesList)
                oneHotEncoderFit=oneHotEncoder.fit(dataset)
                oneHotEncoderFeaturesDataset=oneHotEncoderFit.transform(dataset)'''
            featureAssembler = VectorAssembler(inputCols=indexed_features +
                                               numericalFeatures,
                                               outputCol='features',
                                               handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=maxCategories,
                                          handleInvalid="skip").fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            train_data, test_data = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=40)

            lr = LinearRegression(featuresCol="vectorIndexedFeatures",
                                  labelCol=label)
            regressor = lr.fit(train_data)
            # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)
            featurePredictedLabel = feature_colm
            featurePredictedLabel.append('prediction')
            featurePredictedLabel.append(label)
            # testDataEvaluation = regressor.evaluate(test_data)
            # testDataPrediction = testDataEvaluation.predictions
            # testDataPrediction.select(featurePredictedLabel).show()

            prediction = regressor.evaluate(test_data)
            prediction_val = prediction.predictions
            testDataPrediction = prediction_val.select(featurePredictedLabel)

            # storing test predicted value to the dataset

            prediction_val_pand = prediction_val.select(
                label, "prediction").toPandas()
            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] -
                prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]
            prediction_val_pand_label = prediction_val_pand[label]
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            lr_prediction = regressor.transform(test_data)
            lr_prediction.groupBy(label, "prediction").count().show()
            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" %
                  str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjsted_r_square = training_summary.r2adj
            print("deviance residuals %s" %
                  str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            residual_graph = training_summary.residuals
            residual_graph_pandas = residual_graph.toPandas()
            print("coefficient standard errors: \n" +
                  str(training_summary.coefficientStandardErrors))
            coefficientStdError = str(
                training_summary.coefficientStandardErrors)
            print(" Tvalues :\n" + str(training_summary.tValues))
            T_values = str(training_summary.tValues)
            tValuesList = training_summary.tValues
            print(" p values :\n" + str(training_summary.pValues))
            P_values = str(training_summary.pValues)
            coefficientList = list(regressor.coefficients)

            #summaryData
            import pyspark.sql.functions as F
            import builtins
            round = getattr(builtins, 'round')
            print(coefficientList)
            coefficientListRounded = []
            for value in coefficientList:
                coefficientListRounded.append(round(value, 4))
            # print(coefficientListRounded)
            # print(intercept_t)
            interceptRounded = round(float(intercept_t), 4)
            # print(interceptRounded)
            # print(RMSE)
            RMSERounded = round(RMSE, 4)
            # print(RMSERounded)
            MSERounded = round(MSE, 4)
            rSquareRounded = round(r_square, 4)
            adjustedrSquareRounded = round(adjsted_r_square, 4)
            coefficientStdError = training_summary.coefficientStandardErrors
            coefficientStdErrorRounded = []
            for value in coefficientStdError:
                coefficientStdErrorRounded.append(round(float(value), 4))
            print(coefficientStdErrorRounded)
            tValuesListRounded = []
            for value in tValuesList:
                tValuesListRounded.append(round(value, 4))
            print(tValuesListRounded)
            pValuesListRounded = []
            PValuesList = training_summary.pValues

            for value in PValuesList:
                pValuesListRounded.append(round(value, 4))
            print(pValuesListRounded)

            # regression equation
            intercept_t = float(intercept_t)
            coefficientList = list(regressor.coefficients)
            equation = label, '=', interceptRounded, '+'
            for feature, coeff in zip(feature_colm, coefficientListRounded):
                coeffFeature = coeff, '*', feature, '+'
                equation += coeffFeature
            equation = equation[:-1]
            print(equation)
            equationAsList = list(equation)
            '''# statTable function
            def summaryTable(self,featuresName,featuresStat):
                statTable={}
                for name, stat in zip(featuresName.values(),
                                      featuresStat.values()):
                    print(name, ": ", stat)
                    statTable[name]=stat
                return statTable
            '''

            # significance value

            PValuesList = training_summary.pValues
            significanceObject = {}

            for pValue in pValuesListRounded:
                if (0 <= pValue < 0.001):
                    significanceObject[pValue] = '***'
                if (0.001 <= pValue < 0.01):
                    significanceObject[pValue] = '**'
                if (0.01 <= pValue < 0.05):
                    significanceObject[pValue] = '*'
                if (0.05 <= pValue < 0.1):
                    significanceObject[pValue] = '.'
                if (0.1 <= pValue < 1):
                    significanceObject[pValue] = '-'
            print(significanceObject)

            # storing test predicted value to the dataset

            predictionData = 'prediction.parquet'

            predictionDataStoring = locationAddress + userId + predictionData
            testDataPrediction.write.parquet(predictionDataStoring,
                                             mode='overwrite')

            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index',
                                          f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_residuals = pred_d.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            QQPlot = 'QQPlot.parquet'
            # locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'

            QQPlotAddress = locationAddress + userId + QQPlot
            pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            #################################################################################3
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(
                label, 'prediction',
                sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn(
                'row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            sqrt_label_residual_join.show()
            std_resid = sqrt_label_residual_join.select(
                'sqrt_label', 'prediction',
                (sqrt_label_residual_join['residuals'] /
                 sqrt_label_residual_join['sqrt_label']).alias('std_res'))
            std_resid.show()
            sqrt_std_res = std_resid.select(
                "std_res", 'prediction',
                sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction',
                                                      'sqrt_std_resid')

            scaleLocationPlot = 'scaleLocation.parquet'

            scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
            sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress,
                                              mode='overwrite')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')
            ###########
            #QQplot
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math

            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # length of the sorted std residuals
            count = sorted_res.groupBy().count().toPandas()
            countList = count.values.tolist()
            tuple1 = ()
            for k in countList:
                tuple1 = k
            for tu in tuple1:
                lengthResiduals = tu
            print(lengthResiduals)
            quantileList = []
            for x in range(0, lengthResiduals):
                quantileList.append((x - 0.5) / (lengthResiduals))

            print(quantileList)

            # Z-score on theoritical quantile

            zTheoriticalTrain = []
            for x in quantileList:
                zTheoriticalTrain.append(norm.ppf(abs(x)))
            print(zTheoriticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append(
                    (x - meanResidualTrain) / stdevResidualTrain)

            ##########
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index',
                                     f.monotonically_increasing_id())
            target_d = target.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d,
                                      on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)

            ##########3
            # table_response = {
            #
            #     "Intercept": intercept_t,
            #     "Coefficients": coefficient_t,
            #     "RMSE": RMSE,
            #     "MSE": MSE,
            #     "R_square": r_square,
            #     "Adj_R_square": adjsted_r_square,
            #     "coefficientStdError": coefficientStdError,
            #     "T_value": T_values,
            #     "P_value": P_values
            #
            # }
            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            quantile_label = lr_prediction_quantile.approxQuantile(
                label, x, 0.01)
            quantile_prediction = lr_prediction_quantile.approxQuantile(
                "prediction", x, 0.01)
            Q_label_pred = ''
            print(len(quantile_label))
            length = len(quantile_label)

            for i in range(0, len(quantile_label)):
                Q_label_pred += str(quantile_label[i]) + '\t' + str(
                    quantile_prediction[i]) + '\n'
            import math

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(
                    prediction_val_pand_predict[i]) + '\t' + str(
                        prediction_val_pand_residual[i]) + '\n'
            ## scale location graph data

            prediction_val_pand_residual
            prediction_val_pand_predict
            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs(
            )
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            sqrt_residual
            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_ = statistics.stdev(prediction_val_pand_residual)

            # calcuate stnd residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)
            # QUANTILE

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile(
                'value', x, 0.01)
            print(quantile_std_res_t)
            print(x)
            # calculating the z_score
            from scipy.stats import norm

            ## sort the list
            sorted_std_res = sorted(std_res)

            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            # print(mean)
            quantile = []
            n = len(std_res)
            print(n)
            for x in range(0, n):
                quantile.append((x - 0.5) / (n))

            print(quantile)
            # z_score theoratical
            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(abs(x)))
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)
            Q_label_pred = ''
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(quant) + '\t' + str(val) + '\n'
            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            tableContent = \
                {
                    'coefficientValuesKey': coefficientListRounded,
                    'tValuesKey': tValuesListRounded,
                    'pValuesKey': pValuesListRounded,
                    'significanceValuesKey': significanceObject,
                    'interceptValuesKey': interceptRounded,
                    "RMSE": RMSERounded,
                    "RSquare": rSquareRounded,
                    "AdjRSquare": adjustedrSquareRounded,
                    "CoefficientStdError": coefficientStdErrorRounded,
                    'equationKey': equation
                }

            json_response = {
                'table_data': tableContent,
                'graph_data': graph_response
            }
            print(json_response)
            return (json_response)
        except Exception as e:
            print('exception is =' + str(e))
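
For reference, the theoretical quantiles used for the Q-Q plot earlier in this method are conventionally taken at probabilities (i - 0.5) / n for i = 1..n and mapped through the normal inverse CDF. A small sketch under that assumption (sorted_std_res is the sorted list of standardized residuals built above):

from scipy.stats import norm

n = len(sorted_std_res)
probs = [(i - 0.5) / n for i in range(1, n + 1)]  # strictly inside (0, 1)
z_theory = [norm.ppf(p) for p in probs]           # symmetric around zero by construction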
Example #13
df_invoices_temp1 = df_inner_joined.withColumnRenamed('price',
                                                      'price_per_unit')

df_invoices_temp1.show(2)

df_invoices_temp2 = df_invoices_temp1.withColumn\
                    ('total_price', df_invoices_temp1.price_per_unit * df_invoices_temp1.quantity_ordered)

df_invoices_temp2.select('customer_name', 'product_name', 'price_per_unit',
                         'quantity_ordered', 'total_price').show(2)

from pyspark.sql.functions import sqrt, pow

df_invoices_temp3 = df_invoices_temp2.withColumn\
                    ('shipping_distance', sqrt(pow(df_invoices_temp2.geolocation_x - \
                                                       df_invoices_temp2.warehouse_x,2) + \
                                               pow(df_invoices_temp2.geolocation_y - \
                                                     df_invoices_temp2.warehouse_y,2)))

df_invoices_temp3.select('customer_name', 'geolocation_x', 'geolocation_y',
                         'shipping_distance').show(2)

df_invoices_temp4 = df_invoices_temp3.withColumn\
                    ('shipping_costs', df_invoices_temp3.shipping_distance * 10)

df_invoices_temp4.select('customer_name', 'geolocation_x', 'geolocation_y',
                         'shipping_distance', 'shipping_costs').show(2)

df_invoices_final = df_invoices_temp4
df_invoices_final.cache()

df_invoices_final.select('customer_id', 'customer_name', 'address',
    when(
        substring(trim(df.state), 1, 10).contains('n') == False,
        df["position"].getItem(1)).otherwise(0))

# Look up the values (state, x/y position) of the next row and add them to the current row
df = df.withColumn("nxt_state", lead(df.state).over(my_window))
df = df.withColumn("nxt_x", lead(df.x).over(my_window))
df = df.withColumn("nxt_y", lead(df.y).over(my_window))

# Identify the frames that correspond to goals, using the following criteria:
# 1. A ball is detected in the current frame but disappears in the next frame
# 2. The ball's position is inside the foosball goal area (this area is not very precise for now)
df = df.withColumn(
    "goal",
    when((substring(trim(df.state), 1, 10).contains('n') == False) &
         (substring(trim(df.nxt_state), 1, 10).contains('n')) & (df.y > 150) &
         (df.y < 300) & ((df.x < 100) | (df.x > 500)), 1).otherwise(0))
df = df.withColumn("goal", lead(df.goal).over(my_window))
df.groupBy('goal').count().show()
df = df.filter(df.goal.isNotNull())

# Compute the speed from the distance between coordinates and a conversion coefficient (an experimental coefficient is used for this test)
df = df.withColumn("vitesse",
                   sqrt(pow(df.x - df.nxt_x, 2) + pow(df.y - df.nxt_y, 2)))
df = df.withColumn("vitesse", round(df["vitesse"] * 30 / 500, 2))
df = df.withColumn("total_goal", sum(df.goal).over(my_window))
df.show()

# Write out to a new JSON file
df.write.json("file:///home/ymo/babyfoot.json", mode='overwrite')
exit()
                                              (col('Start_Longitude') > -80) &\
                                              (col('Start_Longitude') < -70) &\
                                              (col('Start_Latitude') > 40) &\
                                              (col('Start_Latitude') < 46) &\
                                              (col('End_Longitude') > -80) &\
                                              (col('End_Longitude') < -70) &\
                                              (col('End_Latitude') > 40) &\
                                              (col('End_Latitude') < 46) &\
                                              (col('Cost') > 0))

yellow_tripdata_1m = yellow_tripdata_1m \
    .withColumn("Duration", (unix_timestamp(col("End_Datetime")) - unix_timestamp(col("Start_Datetime"))) / 60) \
    .withColumn("Diff_Longitude", col("End_Longitude") - col("Start_Longitude")) \
    .withColumn("Diff_Latitude", col("End_Latitude") - col("Start_Latitude")) \
    .withColumn("a", F.pow(F.sin(col("Diff_Latitude") / 2), 2) +
                F.cos(col("Start_Latitude")) * F.cos(col("End_Latitude")) * F.pow(F.sin(col("Diff_Longitude") / 2), 2)) \
    .withColumn("Distance", 2 * 6371 * F.atan2(F.sqrt(col("a")), F.sqrt(1.0 - col("a")))) \
    .drop("Diff_Longitude", "Diff_Latitude", "Start_Datetime", "End_Datetime",
          "Start_Longitude", "Start_Latitude", "End_Longitude", "End_Latitude", "a", "Cost")

yellow_trip_joined = yellow_tripdata_1m.join(yellow_tripvendors_1m, "ID", "inner").drop("ID")
yellow_trip_joined.createOrReplaceTempView("yellow_trip_joined")

window = Window.partitionBy("Vendor")
res = yellow_trip_joined.withColumn("Max_Distance", F.max("Distance").over(window))\
                        .where(col("Distance") == col("Max_Distance"))\
                        .drop("Max_Distance").select(["Vendor", "Distance", "Duration"])
   
res.show() 
print("Time of Q2 using SQL with parquet is: %s seconds" % (time.time() - start_time_parquet)) 
Example #16
                   "dupeCount = 1").drop('dupeCount')

# 2
# get the POI list
POI_df = sqlContext.read.format('csv').options(
    header='true', inferschema='true').load('../../tmp/data/POIList.csv')

# cross join dataframes to have reference to POI longitude and latitude in the same row for calculation
df_2 = df.crossJoin(POI_df.select('*'))

# belatedly realized that geographical distance should be calculated with the Haversine formula
# I used the formula for distance between points: sqrt((x2-x1)^2 + (y2-y1)^2)
# Using the Haversine formula, I would have to create a UDF which takes in both geographical points and returns the distance
df_2 = df_2.withColumn(
    'dist',
    f.sqrt(((df_2["lat"] - df_2[" Latitude"])**2) +
           ((df_2["long"] - df_2["Longitude"])**2)))

# get the minimum distance for each unique request, then join with the original table to get the closest POIID
df_3 = df_2.groupby(['time', 'lat', 'long']).min('dist')
df_3 = df_3.join(df_2, (df_3['time'] == df_2['time']) &
                 (df_3['lat'] == df_2['lat']) & (df_3['long'] == df_2['long'])
                 & (df_3['min(dist)'] == df_2['dist'])).drop(df_2['lat']).drop(
                     df_2['long']).drop(df_2['time'])
df_3 = df_3.drop(df_3[' Latitude']).drop(df_3['Longitude'])

# 3
# aggregation for average and standard deviation
df_5 = df_3.groupby('POIID').agg(f.avg('dist'), f.stddev('dist'))

# using max(dist) as the radius - the furthest point from the POI should be the radius to use to draw the circle
df_6 = df_3.groupby('POIID').agg(f.max('dist'), f.count('dist'))
    def train(self,
              df_train,
              top_N=None,
              user_column_name="user",
              item_column_name="item",
              rating_column_name="rating"):
        """
        Calculate cosine similarities between user pairs

        Parameters
        ----------
        df_train : DataFrame
            Ratings of items in the following format: [user, item, rating]
        top_N : int or None
            Number of top similarities for a given user pair that will compose
            a similarity matrix. It is used in the train phase
        rating_column_name : str
        user_column_name : str
        item_column_name : str
        """
        top_N = int(top_N) if top_N else self.top_N_similarities
        user_column_name = str(user_column_name)
        item_column_name = str(item_column_name)
        rating_column_name = str(rating_column_name)

        clmn_names = [
            F.col(user_column_name).alias("user"),
            F.col(item_column_name).alias("item"),
            F.col(rating_column_name).alias("rating")
        ]

        df_train = df_train.select(clmn_names)

        left_clmn_names = [
            F.col("item").alias("p"),
            F.col("user").alias("u1"),
            F.col("rating").alias("v1")
        ]

        right_clmn_names = [
            F.col("item").alias("p"),
            F.col("user").alias("u2"),
            F.col("rating").alias("v2")
        ]

        # Step 1. Create matrix

        df_dot = df_train.select(left_clmn_names)\
            .join(df_train.select(right_clmn_names), on="p")\
            .where(F.col("u1") != F.col("u2"))\
            .groupBy([F.col("u1"), F.col("u2")])\
            .agg(F.sum(F.col("v1") * F.col("v2")).alias("dot"))

        # Step 2. Calculate norms

        df_norm = df_train.select(left_clmn_names)\
            .groupBy(F.col("u1"))\
            .agg(F.sqrt(F.sum(F.col("v1") * F.col("v1"))).alias("norm"))

        similarity_clmns = [
            F.col("u1"),
            F.col("u2"),
            (F.col("dot") / F.col("n1") / F.col("n2")).alias("sim")
        ]

        # Step 4. Calculate similarities

        df_similarity = df_dot.join(df_norm.select(F.col("u1"), F.col("norm").alias("n1")), on="u1")\
                    .join(df_norm.select(F.col("u1").alias("u2"), F.col("norm").alias("n2")), on="u2")\
                    .select(similarity_clmns)

        window = Window.partitionBy(F.col("u1"), F.col("u2"))

        # Step 5. Truncate similarities

        df_similarity_N = df_similarity.select("*", F.count("sim").over(window).alias("rank"))\
                    .filter(F.col("rank") <= top_N)

        # Step 6. Save data

        self.top_N_similarities = top_N
        self.df_similarity = df_similarity_N.select("u1", "u2",
                                                    "sim").persist()
        self.df_train = df_train.persist()

        # Force persistence by calling count()
        self.df_similarity.count()
        self.df_train.count()
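
A hypothetical call of the method above, assuming spark is an active SparkSession and recommender is an instance of the class this method belongs to:

ratings = spark.createDataFrame(
    [("u1", "i1", 4.0), ("u1", "i2", 3.0), ("u2", "i1", 5.0), ("u2", "i3", 1.0)],
    ["user", "item", "rating"])

recommender.train(ratings, top_N=30,
                  user_column_name="user",
                  item_column_name="item",
                  rating_column_name="rating")
recommender.df_similarity.show()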
Example #18
    'x',
    F.cos(data['latitude'] * g2r) * F.cos(data['longitude'] * g2r))
data = data.withColumn(
    'y',
    F.cos(data['latitude'] * g2r) * F.sin(data['longitude'] * g2r))
data = data.withColumn('z', F.sin(data['latitude'] * g2r))

"c里面是几何中心,TODO需要改成patterns里面算出来的车窝"
c = data.groupBy('engine_serial_number').agg(
    F.mean('x').alias('x'),
    F.mean('y').alias('y'),
    F.mean('z').alias('z'))
c = c.withColumn('clon', F.atan(c['y'] / c['x']) / g2r + 180)
c = c.withColumn(
    'clat',
    F.atan(c['z'] / F.sqrt(c['x'] * c['x'] + c['y'] * c['y'])) / g2r)
c = c.drop('x', 'y', 'z')

# TODO: replace this directly with the home location (车窝)
# bucket the mileage to 100-metre increments
data = data.drop('x', 'y', 'z')
data = data.withColumn('dis', (data["high_resolution_total_vehicle_distance"] /
                               100).cast(IntegerType()) * 100)
data = data.withColumn(
    'time',
    F.unix_timestamp('occurrence_date_time').cast(IntegerType()))

temp = data.groupBy('dis', 'engine_serial_number').agg(
    (F.max(data['time']) - F.min(data['time'])).alias('staydur'))

data = data.withColumn(
Example #19
# XXX:
laplacian = sys.argv[1]

if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N,
                         numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()
elif laplacian == 'symmetric':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / sqrt(x[1])))
    D_invsq = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N)
    tmp = D_invsq.multiply(W.toBlockMatrix()).multiply(D_invsq)
    L = I.toBlockMatrix().subtract(tmp)
else:
    raise ValueError('Unknown type of Laplacian.')

## SVD, and transform from dense matrix to dataframe.
svd = L.toRowMatrix().computeSVD(k=K, computeU=False)
V = svd.V.toArray().tolist()
VV = spark.createDataFrame(V)
kmeans = KMeans().setK(K).setSeed(1)
vecAssembler = VectorAssembler(inputCols=VV.schema.names, outputCol='features')
VV = vecAssembler.transform(VV)
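
A sketch of the step that presumably follows (the snippet stops after assembling the feature vector): fit the KMeans model defined above on the spectral embedding and inspect the cluster assignments.

model = kmeans.fit(VV.select('features'))
assignments = model.transform(VV)
assignments.groupBy('prediction').count().show()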
Example #20
    def _fit(self, ratings_df):
        '''
        Fit ALS model using reviews as training data.

        Parameters
        ==========
        ratings_df      (pyspark.sql.DataFrame) Data used to train recommender
                        model. Columns are 'user', 'item', and 'rating'. Values
                        of user and item must be numeric. Values of rating
                        range from 1 to 5.

        Returns
        =======
        self
        '''
        # avg_rating_df = (
        #     ratings_df
        #     .groupBy()
        #     .avg(self.getRatingCol())
        #     .withColumnRenamed('avg({})'.format(self.getRatingCol()),
        #                        'avg_rating')
        # )

        # print('Fit starting!')

        start_time = time.monotonic()

        # print('ratings_df')
        # ratings_df.show()

        rating_stats_df = (
            ratings_df
            .agg(
                F.avg(self.getRatingCol()).alias('avg_rating'),
                F.stddev_samp(self.getRatingCol()).alias('stddev_rating')
            )
        )

        # print('ratings_stats_df:')
        # rating_stats_df.show()

        # if not self.getUseALS():
        #     self.setLambda_1(0.0)
        #     self.setLambda_2(0.0)

        item_bias_df = (
            ratings_df
            .crossJoin(rating_stats_df)
            .withColumn(
                'diffs_item_rating',
                F.col(self.getRatingCol()) - F.col('avg_rating')
            )
            .groupBy(self.getItemCol())
            .agg(
                F.avg(F.col('diffs_item_rating')).alias('avg_diffs_item_rating'),
                F.nanvl(
                    F.stddev_samp(F.col('diffs_item_rating')),
                    F.lit(2.147483647E9)
                ).alias('stddev_diffs_item_rating'),
                F.count("*").alias('count_item_rating')
            )
            .withColumn(
                'stderr_diffs_item_rating',
                (self.getLambda_1() + F.col('stddev_diffs_item_rating'))
                / F.sqrt('count_item_rating')
            )
            .withColumn(
                'item_bias',
                F.col('avg_diffs_item_rating')
                / (1 +  F.col('stderr_diffs_item_rating'))
            )
            .select(
                self.getItemCol(),
                'item_bias',
                'avg_diffs_item_rating',
                'stderr_diffs_item_rating',
                'stddev_diffs_item_rating',
                'count_item_rating'
            )
        )

        # print('item_bias_df:')
        # item_bias_df.show(5)

        # item_bias_df.printSchema()

        # print('item_bias_df NaN')
        # item_bias_df.where(F.isnan("item_bias")).show()

        user_bias_df = (
            ratings_df
            .crossJoin(rating_stats_df)
            .join(item_bias_df, on=self.getItemCol())
            .withColumn(
                'diffs_user_rating',
                F.col(self.getRatingCol()) - F.col('avg_rating') - F.col('item_bias')
            )
            .groupBy(self.getUserCol())
            .agg(
                F.avg(F.col('diffs_user_rating')).alias('avg_diffs_user_rating'),
                F.nanvl(
                    F.stddev_samp(F.col('diffs_user_rating')),
                    F.lit(2.147483647E9)
                ).alias('stddev_diffs_user_rating'),
                F.count("*").alias('count_user_rating')
            )
            .withColumn(
                'stderr_diffs_user_rating',
                (self.getLambda_2() + F.col('stddev_diffs_user_rating'))
                / F.sqrt('count_user_rating')
            )
            .withColumn(
                'user_bias',
                F.col('avg_diffs_user_rating')
                / (1 + F.col('stderr_diffs_user_rating'))
            )
            .select(
                self.getUserCol(),
                'user_bias',
                'avg_diffs_user_rating',
                'stderr_diffs_user_rating',
                'stddev_diffs_user_rating',
                'count_user_rating'
            )
        )

        # print('user_bias_df:')
        # user_bias_df.show(5)

        # print('user_bias_df NaN')
        # user_bias_df.where(F.isnan("user_bias")).show()

        if self.getUseALS():
            if self.getUseBias():
                residual_df = (
                    ratings_df
                    .crossJoin(rating_stats_df)
                    .join(user_bias_df, on=self.getUserCol())
                    .join(item_bias_df, on=self.getItemCol())
                    .withColumn(
                        self.getRatingCol(),
                        F.col(self.getRatingCol())
                        - F.col('avg_rating')
                        - F.col('user_bias')
                        - F.col('item_bias')
                    )
                    .select(
                        self.getUserCol(),
                        self.getItemCol(),
                        self.getRatingCol()
                    )
                )

            else:
                residual_df = ratings_df
                # self.setColdStartStrategy('drop')

            residual_stats_df = (
                residual_df
                .agg(
                    F.avg(F.col(self.getRatingCol())).alias('avg_residual'),
                    F.stddev(F.col(self.getRatingCol())).alias('stddev_residual')
                )
            )

            # print('residual_df')
            # residual_df.show()

            # print('residual_df NaN')
            # residual_df.where(F.isnan("rating")).show()

            # print('residual_stats_df')
            # residual_stats_df.show()

            als_model = ALS(
                rank=self.getRank(),
                maxIter=self.getMaxIter(),
                regParam=self.getRegParam(),
                numUserBlocks=self.getNumUserBlocks(),
                numItemBlocks=self.getNumItemBlocks(),
                implicitPrefs=self.getImplicitPrefs(),
                alpha=self.getAlpha(),
                userCol=self.getUserCol(),
                itemCol=self.getItemCol(),
                ratingCol=self.getRatingCol(),
                nonnegative=self.getNonnegative(),
                checkpointInterval=self.getCheckpointInterval(),
                intermediateStorageLevel=self.getIntermediateStorageLevel(),
                finalStorageLevel=self.getFinalStorageLevel()
            )

            recommender = als_model.fit(residual_df)

        else:
            recommender = None
            residual_stats_df = None

        print('Fit done in {} seconds'.format(time.monotonic() - start_time))

        return(
            RecommenderModel(
                self.getUseALS(), self.getUseBias(), self.getLambda_3(),
                # self.getColdStartStrategy(),
                recommender, rating_stats_df, residual_stats_df,
                user_bias_df, item_bias_df
            )
        )

# In[66]:

get_ipython().magic('matplotlib inline')


# In[67]:

import seaborn


# In[69]:

sentiment_pd = best_model \
    .transform(airportCleanDF) \
    .groupby('airport_name') \
    .agg(fn.avg('prediction').alias('prediction'),
         (2 * fn.stddev('prediction') / fn.sqrt(fn.count('*'))).alias('err')) \
    .toPandas()
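# The 'err' column is twice the standard error of the mean prediction per
# airport (2 * stddev / sqrt(n)), i.e. roughly the half-width of a 95%
# confidence interval for the average prediction.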


# In[70]:
for window in [1, 5, 10, 20, 80]:
    df = df.withColumn(
        "squared_error_Window_{}".format(window),
        pow((col("Close_Actual_Window_{}".format(window)) - col("close_ma")),
            2))
    df = df.withColumn(
        "s_abs_percentage_error_Window_{}".format(window),
        (abs(col("close_ma") - col("Close_Actual_Window_{}".format(window))) /
         ((col("Close_Actual_Window_{}".format(window)) + col("close_ma")) /
          2)) * 100)

df.show()

df = df.withColumn("rank",
                   percent_rank().over(Window.partitionBy().orderBy("date")))
train_data = df.where("rank <= .9").drop("rank")
test_data = df.where("rank > .9").drop("rank")

# total_rmse / total_smape are not initialised earlier in this excerpt;
# they are assumed to start as empty lists.
total_rmse = []
total_smape = []
for window in [1, 5, 10, 20, 80]:
    total_rmse.append(
        test_data.select(
            sqrt(mean(col(
                "squared_error_Window_{}".format(window))))).collect())
    total_smape.append(
        test_data.select(
            mean(col(
                "s_abs_percentage_error_Window_{}".format(window)))).collect())

print(total_mape)
print(total_rmse)
print(total_smape)
def get_sd(col):
    return (func.sqrt(func.avg(col * col) - func.avg(col) * func.avg(col)))
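

# Quick check (sketch): get_sd computes the population standard deviation from
# raw moments, sqrt(E[X^2] - E[X]^2), so it should agree with Spark's built-in
# stddev_pop. Assumes `func` is pyspark.sql.functions and `spark` is an active
# SparkSession, as elsewhere in these examples:
#
#   df = spark.createDataFrame([(1.0,), (2.0,), (4.0,)], ['x'])
#   df.select(get_sd(func.col('x')).alias('sd_moments'),
#             func.stddev_pop('x').alias('sd_builtin')).show()
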
def main():

    # Args
    args = parse_args()
    # args.in_ld_folder = 'input_data/ld_each_variant'
    # args.in_manifest = 'input_data/190625/ld_analysis_input.tsv'
    # args.in_top_loci = 'input_data/190625/toploci.parquet'
    # args.out = 'output/ld_w_crediblesets.parquet'
    # args.min_r2 = 0.5

    # Make spark session
    global spark
    spark = (pyspark.sql.SparkSession.builder.config("spark.master",
                                                     "local[*]").getOrCreate())
    print('Spark version: ', spark.version)
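
    # Note: load_ld, load_manifest, arctanh, round6dp and norm_sf used below
    # are helper functions/UDFs defined elsewhere in the original script; they
    # are not shown in this excerpt.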

    #
    # Load data ---------------------------------------------------------------
    #

    # Load LD
    ld = (
        load_ld(args.in_ld_folder).withColumn(
            'index_variant_id',
            regexp_replace(col('index_variant_id'), ':', '_')).withColumn(
                'tag_variant_id',
                regexp_replace(col('tag_variant_id'), ':', '_'))
        # .limit(10000) # Debug
    )

    # Load manifest
    manifest = (load_manifest(args.in_manifest).withColumnRenamed(
        'variant_id', 'index_variant_id'))

    #
    # Weight correlations by study population ---------------------------------
    #

    # Join LD to manifest
    data = manifest.join(ld, on='index_variant_id')

    # Replace R fields
    for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']:
        data = (
            data
            # Replace all R values == 1 with 0.9999995, otherwise we get an error
            # This is reverted later by rounding to 6 dp
            .withColumn(coln,
                when(col(coln) == 1, 0.9999995).otherwise(col(coln))
            )
            # Fill nulls with 0
            .withColumn(coln,
                when(col(coln).isNull(), 0).otherwise(col(coln))
            )
        )

    # Fisher transform correlations to z-scores
    for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']:
        data = data.withColumn(coln.replace('R_', 'Z_'), arctanh(col(coln)))

    # Compute weighted average across populations
    data = data.withColumn(
        'Z_overall',
        ((col('AFR_prop') * col('Z_AFR')) + (col('AMR_prop') * col('Z_AMR')) +
         (col('EAS_prop') * col('Z_EAS')) + (col('EUR_prop') * col('Z_EUR')) +
         (col('SAS_prop') * col('Z_SAS'))))

    # Inverse Fisher transform the weighted z-score back to a correlation
    data = data.withColumn('R_overall', tanh(col('Z_overall')))

    # Round R_overall to 6 dp
    data = data.withColumn('R_overall', round6dp(col('R_overall')))

    # Convert R to R2
    data = data.withColumn('R2_overall', pow(col('R_overall'), 2))

    # Drop rows where R2 is null
    data = data.filter(col('R2_overall').isNotNull())

    # Filter based on overall R2
    data = data.filter(col('R2_overall') >= args.min_r2)

    # Drop unneeded columns
    data = data.drop(*[
        'Z_overall', 'R_overall', 'R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS',
        'Z_AFR', 'Z_AMR', 'Z_EAS', 'Z_EUR', 'Z_SAS', 'index_variant_id'
    ])

    # Denormalise variant IDs
    data = (data.withColumnRenamed('chrom', 'lead_chrom').withColumnRenamed(
        'pos', 'lead_pos').withColumnRenamed(
            'ref', 'lead_ref').withColumnRenamed('alt', 'lead_alt').withColumn(
                'tag_split', split(col('tag_variant_id'), '_')).withColumn(
                    'tag_chrom',
                    col('tag_split').getItem(0)).withColumn(
                        'tag_pos',
                        col('tag_split').getItem(1).cast('int')).withColumn(
                            'tag_ref',
                            col('tag_split').getItem(2)).withColumn(
                                'tag_alt',
                                col('tag_split').getItem(3)).drop(
                                    'tag_split', 'tag_variant_id'))

    #
    # Conduct credible set analysis using PICS adjustment ---------------------
    #
    ''' Probabilistic Identification of Causal SNPs (PICS) from Farh (2014):
            https://www.nature.com/articles/nature13835

        Adjusts the p-values for tag SNPs based on the p-value of the lead SNP
        and its LD.
    '''

    # Empirical constant that can be adjusted to fit the curve; 6.4 is recommended.
    k = 6.4

    # Load toploci
    toploci = spark.read.parquet(args.in_top_loci)

    # Join negative log pvalue from toploci onto data
    toploci = (toploci.withColumn(
        'neglog_p',
        -1 * (log10(col('pval_mantissa')) +
              col('pval_exponent'))).withColumnRenamed(
                  'chrom', 'lead_chrom').withColumnRenamed(
                      'pos', 'lead_pos').withColumnRenamed(
                          'ref', 'lead_ref').withColumnRenamed(
                              'alt',
                              'lead_alt').select('study_id', 'lead_chrom',
                                                 'lead_pos', 'lead_ref',
                                                 'lead_alt', 'neglog_p'))
    data = data.join(
        toploci,
        on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt'])

    # Calculate PICS statistics
    data = (data.withColumn('pics_mu',
                            col('R2_overall') * col('neglog_p')).withColumn(
                                'pics_std',
                                sqrt(1 - sqrt(col('R2_overall'))**k) *
                                sqrt(col('neglog_p')) / 2).withColumn(
                                    'pics_relative_prob',
                                    when(col('pics_std') == 0,
                                         1.0).otherwise(
                                             norm_sf(col('pics_mu'),
                                                     col('pics_std'),
                                                     col('neglog_p')))))
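
    # Each tag variant gets a relative probability from a normal distribution
    # centred on pics_mu with standard deviation pics_std, evaluated at the
    # lead SNP's neglog_p via the norm_sf helper (assumed to wrap
    # scipy.stats.norm.sf; not defined in this excerpt). A pics_std of 0
    # corresponds to perfect LD with the lead variant, which is assigned a
    # relative probability of 1.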

    # Calculate the sum of the posterior probabilities at each locus
    pics_prob_sums = (data.groupby(
        'study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt').agg(
            sum('pics_relative_prob').alias('pics_relative_prob_sum')))

    # Merge back onto data
    data = data.join(
        pics_prob_sums,
        on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt'])

    # Calculate posterior probability at each locus
    data = (data.withColumn(
        'pics_postprob',
        col('pics_relative_prob') / col('pics_relative_prob_sum')).drop(
            'pics_relative_prob_sum', 'neglog_p'))

    # Calculate cumulative sum per locus
    window_spec = (Window.partitionBy('study_id', 'lead_chrom', 'lead_pos',
                                      'lead_ref', 'lead_alt').orderBy(
                                          desc('pics_postprob')).rowsBetween(
                                              Window.unboundedPreceding,
                                              Window.currentRow))
    data = (data.withColumn('pics_postprob_cumsum',
                            sum('pics_postprob').over(window_spec)))

    # Label whether each row is in the 95 and 99% credible sets
    window_spec = (Window.partitionBy(
        'study_id', 'lead_chrom', 'lead_pos', 'lead_ref',
        'lead_alt').orderBy('pics_postprob_cumsum'))
    data = (data.withColumn(
        'pics_95perc_credset',
        when(lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.95,
             False).otherwise(True)).withColumn(
                 'pics_99perc_credset',
                 when(
                     lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.99,
                     False).otherwise(True)))
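
    # A tag variant is kept in the 95% (99%) credible set unless the cumulative
    # posterior probability of the variants ranked before it has already
    # reached 0.95 (0.99); lag() inspects the previous row's cumulative sum.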

    #
    # Write output ------------------------------------------------------------
    #

    # Rename columns and format
    data = (data.withColumnRenamed(
        'AFR_prop', 'AFR_1000G_prop').withColumnRenamed(
            'AMR_prop', 'AMR_1000G_prop').withColumnRenamed(
                'EAS_prop', 'EAS_1000G_prop').withColumnRenamed(
                    'EUR_prop', 'EUR_1000G_prop').withColumnRenamed(
                        'SAS_prop', 'SAS_1000G_prop').withColumnRenamed(
                            'R2_overall', 'overall_r2').select(
                                'study_id', 'lead_chrom', 'lead_pos',
                                'lead_ref', 'lead_alt', 'tag_chrom', 'tag_pos',
                                'tag_ref', 'tag_alt', 'overall_r2', 'pics_mu',
                                'pics_postprob', 'pics_95perc_credset',
                                'pics_99perc_credset', 'AFR_1000G_prop',
                                'AMR_1000G_prop', 'EAS_1000G_prop',
                                'EUR_1000G_prop', 'SAS_1000G_prop'))

    # Save output
    (data.repartitionByRange('study_id', 'lead_chrom',
                             'lead_pos').write.parquet(args.out,
                                                       mode='overwrite'))

    return 0
Beispiel #25
0
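# Note: lat0 and lon0 (the reference coordinates for each request, in radians,
# e.g. F.radians on the Latitude/Longitude columns of clean2) are assumed to
# be defined earlier in the original example; they are not shown here. The POI
# coordinates below are already expressed in radians.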
lat1 = 0.9345569159727344
lon1 = -1.9806997123424743
lat2 = 0.7945023069213337
lon2 = -1.2839693364011688
lat3 = 0.7893221871547071
lon3 = -1.1036193160713015
dlat1 = lat1 - lat0
dlon1 = lon1 - lon0
dlat2 = lat2 - lat0
dlon2 = lon2 - lon0
dlat3 = lat3 - lat0
dlon3 = lon3 - lon0
a1 = F.sin(dlat1 / 2)**2 + F.cos(lat0) * F.cos(lat0) * F.sin(dlon1 / 2)**2
a2 = F.sin(dlat2 / 2)**2 + F.cos(lat0) * F.cos(lat0) * F.sin(dlon2 / 2)**2
a3 = F.sin(dlat3 / 2)**2 + F.cos(lat0) * F.cos(lat0) * F.sin(dlon3 / 2)**2
c1 = F.lit(2) * F.asin(F.sqrt(a1))
c2 = F.lit(2) * F.asin(F.sqrt(a2))
c3 = F.lit(2) * F.asin(F.sqrt(a3))
r = F.lit(6371)
dist1 = (c1 * r).alias('dist1')
dist2 = (c2 * r).alias('dist2')
dist3 = (c3 * r).alias('dist3')

distances = clean2.select("_ID", "TimeSt", "City", "Province", "Latitude",
                          "Longitude", dist1, dist2, dist3)
distances.registerTempTable("dist0")

# POI assignation and minimal distance to poi

query = """SELECT _ID,  TimeSt, City, Province, dist1, dist2, dist3,
    CASE WHEN (dist1 < dist2) AND (dist1 < dist3) THEN "POI1 - EDMONTON"
Beispiel #26
0
    def lassoRegression(self, dataset_add, feature_colm, label_colm,
                        relation_list, relation, userId):
        try:
            dataset = spark.read.parquet(dataset_add)
            dataset.show()
            Rsqr_list = []
            Rsqr_regPara = {}
            print(self.xt)
            # print(data_add)

            label = ''
            for val in label_colm:
                label = val
            #ETL part
            Schema = dataset.schema
            stringFeatures = []
            numericalFeatures = []
            for x in Schema:
                if (str(x.dataType) == "StringType"
                        or str(x.dataType) == 'TimestampType'
                        or str(x.dataType) == 'DateType'
                        or str(x.dataType) == 'BooleanType'
                        or str(x.dataType) == 'BinaryType'):
                    for y in feature_colm:
                        if x.name == y:
                            dataset = dataset.withColumn(
                                y, dataset[y].cast(StringType()))
                            stringFeatures.append(x.name)
                else:
                    for y in feature_colm:
                        if x.name == y:
                            numericalFeatures.append(x.name)

            if relation == 'linear':
                dataset = dataset
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            categoryColmList = []
            categoryColmListFinal = []
            categoryColmListDict = {}
            countOfCategoricalColmList = []
            for value in stringFeatures:
                categoryColm = value
                listValue = value
                listValue = []
                categoryColm = dataset.groupby(value).count()
                countOfCategoricalColmList.append(categoryColm.count())
                categoryColmJson = categoryColm.toJSON()
                for row in categoryColmJson.collect():
                    categoryColmSummary = json.loads(row)
                    listValue.append(categoryColmSummary)
                categoryColmListDict[value] = listValue

            if not stringFeatures:
                maxCategories = 5
            else:
                maxCategories = max(countOfCategoricalColmList)
            for x in Schema:
                if (str(x.dataType) == "StringType" and x.name == label):
                    for labelkey in label_colm:
                        label_indexer = StringIndexer(
                            inputCol=label,
                            outputCol='indexed_' + label,
                            handleInvalid="skip").fit(dataset)
                        dataset = label_indexer.transform(dataset)
                        label = 'indexed_' + label
                else:
                    label = label
            indexed_features = []
            encodedFeatures = []
            for colm in stringFeatures:
                indexer = StringIndexer(inputCol=colm,
                                        outputCol='indexed_' + colm,
                                        handleInvalid="skip").fit(dataset)
                indexed_features.append('indexed_' + colm)
                dataset = indexer.transform(dataset)
            featureAssembler = VectorAssembler(inputCols=indexed_features +
                                               numericalFeatures,
                                               outputCol='features',
                                               handleInvalid="skip")
            dataset = featureAssembler.transform(dataset)
            vectorIndexer = VectorIndexer(inputCol='features',
                                          outputCol='vectorIndexedFeatures',
                                          maxCategories=maxCategories,
                                          handleInvalid="skip").fit(dataset)
            dataset = vectorIndexer.transform(dataset)
            trainDataRatioTransformed = self.trainDataRatio
            testDataRatio = 1 - trainDataRatioTransformed
            train_data, test_data = dataset.randomSplit(
                [trainDataRatioTransformed, testDataRatio], seed=40)

            ######################################################################33
            # lasso final
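            # Grid search: fit one lasso model (elasticNetParam=1) for each
            # candidate regParam in self.xt and keep the value that gives the
            # highest training R^2.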
            for t in self.xt:
                lr1 = LinearRegression(featuresCol="vectorIndexedFeatures",
                                       labelCol=label,
                                       elasticNetParam=1,
                                       regParam=t)
                regressor1 = lr1.fit(train_data)
                print(t)
                print("coefficient : " + str(regressor1.coefficients))
                reg_sum = regressor1.summary
                r2 = reg_sum.r2
                Rsqr_list.append(r2)
                Rsqr_regPara[r2] = t
                print(r2)

            print(Rsqr_list)
            print(max(Rsqr_list))
            maximum_rsqr = max(Rsqr_list)
            print(Rsqr_regPara)
            final_regPara = []

            for key, val in Rsqr_regPara.items():
                if (key == maximum_rsqr):
                    print(val)
                    final_regPara.append(val)

            for reg in final_regPara:
                lr_lasso = LinearRegression(
                    featuresCol="vectorIndexedFeatures",
                    labelCol=label,
                    elasticNetParam=1,
                    regParam=reg)
                regressor = lr_lasso.fit(train_data)
                training_summary = regressor.summary
                r2 = training_summary.r2
                print(r2)

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)
            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)
            prediction = regressor.evaluate(test_data)
            prediction_val = prediction.predictions
            prediction_val.show()
            prediction_val_pand = prediction_val.select(
                label, "prediction").toPandas()
            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] -
                prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]
            prediction_val_pand_label = prediction_val_pand[label]
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            lr_prediction = regressor.transform(test_data)
            lr_prediction.groupBy(label, "prediction").count().show()
            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            lr_prediction_onlypred = lr_prediction.select('prediction')
            # lr_prediction_quantile.show()

            # training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" %
                  str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjsted_r_square = training_summary.r2adj
            print("deviance residuals %s" %
                  str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            # residual_graph = training_summary.residuals
            # test = (residual_graph, lr_prediction_onlypred)
            # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' )
            # print(test)
            # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append')
            # residual_graph_pandas = residual_graph.toPandas()
            # print("coefficient standard errors: \n" + str(training_summary.coefficientStandardErrors))
            # coefficient_error = str(training_summary.coefficientStandardErrors)
            # print(" Tvalues :\n" + str(training_summary.tValues))
            # T_values = str(training_summary.tValues)
            # print(" p values :\n" + str(training_summary.pValues))
            # P_values = str(training_summary.pValues)

            #######################################################################################################
            table_response = {
                "Intercept": intercept_t,
                "Coefficients": coefficient_t,
                "RMSE": RMSE,
                "MSE": MSE,
                "R_square": r_square,
                "Adj_R_square": adjsted_r_square
            }
            #######################################################################################################
            # residual  vs predicted value

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index',
                                          f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_residuals = pred_d.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            QQPlot = 'QQPlot.parquet'
            locationAddress = 'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/'

            # userId = '6786103f-b49b-42f2-ba40-aa8168b65e67'

            QQPlotAddress = locationAddress + userId + QQPlot
            pred_residuals.write.parquet(QQPlotAddress, mode='overwrite')

            # pred_residuals.write.parquet('hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/Q_Q_PLOT.parquet',
            #                              mode='overwrite')

            #################################################################################3
            # scale location plot
            from pyspark.sql.functions import abs as ab, sqrt, mean as meann, stddev as stdDev

            df_label = prediction_data.select(
                label, 'prediction',
                sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn(
                'row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            sqrt_label_residual_join.show()
            std_resid = sqrt_label_residual_join.select(
                'sqrt_label', 'prediction',
                (sqrt_label_residual_join['residuals'] /
                 sqrt_label_residual_join['sqrt_label']).alias('std_res'))
            std_resid.show()
            sqrt_std_res = std_resid.select(
                "std_res", 'prediction',
                sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction',
                                                      'sqrt_std_resid')

            scaleLocationPlot = 'scaleLocation.parquet'

            scaleLocationPlotAddress = locationAddress + userId + scaleLocationPlot
            sqrt_std_res_fitted.write.parquet(scaleLocationPlotAddress,
                                              mode='overwrite')

            # sqrt_std_res_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
            #     mode='overwrite')
            ###########
            #QQplot
            # QUANTILE

            from scipy.stats import norm
            import statistics
            import math

            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            # stdev_ress = sorted_res.select(stdDev(col('residuals')).alias('std_dev'),
            #                                meann(col('residuals')).alias('mean'))
            # stdev_ress.show()
            # mean_residual = stdev_ress.select(['mean']).toPandas()
            # l = mean_residual.values.tolist()
            # print(l)
            # stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            # length of the sorted std residuals
            count = sorted_res.groupBy().count().toPandas()
            countList = count.values.tolist()
            tuple1 = ()
            for k in countList:
                tuple1 = k
            for tu in tuple1:
                lengthResiduals = tu
            print(lengthResiduals)
            quantileList = []
            for x in range(0, lengthResiduals):
                quantileList.append((x - 0.5) / (lengthResiduals))

            print(quantileList)

            # Z-score on theoretical quantile

            zTheoriticalTrain = []
            for x in quantileList:
                zTheoriticalTrain.append(norm.ppf(abs(x)))
            print(zTheoriticalTrain)

            sortedResidualPDF = sorted_res.select('residuals').toPandas()
            sortedResidualPDF = sortedResidualPDF['residuals']
            stdevResidualTrain = statistics.stdev(sortedResidualPDF)
            meanResidualTrain = statistics.mean(sortedResidualPDF)

            zPracticalTrain = []
            for x in sortedResidualPDF:
                zPracticalTrain.append(
                    (x - meanResidualTrain) / stdevResidualTrain)

            ##########
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index',
                                     f.monotonically_increasing_id())
            target_d = target.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d,
                                      on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)

            ##########################################################################################

            # scale location plot

            # for scale location plot
            # from pyspark.sql.functions import udf
            #
            # def std_res(x):
            #     res_list = []
            #     res_list.append(x)
            #
            # std_residuals = udf(lambda y: std_res(y), FloatType())
            #
            # residuals_std = residuals.withColumn('residuals', std_residuals(col('residuals').cast(FloatType())))
            #
            # import statistics
            # import numpy as np
            # residuals_panda = residuals.toPandas()
            # # residuals_panda.residuals = range(residuals_panda.shape[1])
            # residuals_panda = residuals_panda.values
            # print(residuals_panda)
            # stdev_training = statistics.stdev(residuals_panda)
            # print(stdev_training)

            ############################################################################################################

            # creating the dictionary for storing the result

            # json_response = coefficient_t

            # print(json_response)

            # json_response = {"adjusted r**2 value" : training_summary.r2adj}

            # DATA VISUALIZATION PART

            # finding the quantiles in the dataset (Q-Q plot)
            import matplotlib.pyplot as plt

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            quantile_label = lr_prediction_quantile.approxQuantile(
                label, x, 0.01)
            quantile_prediction = lr_prediction_quantile.approxQuantile(
                "prediction", x, 0.01)
            Q_label_pred = ''
            print(len(quantile_label))
            length = len(quantile_label)

            for i in range(0, len(quantile_label)):
                Q_label_pred += str(quantile_label[i]) + '\t' + str(
                    quantile_prediction[i]) + '\n'
            import math

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(
                    prediction_val_pand_predict[i]) + '\t' + str(
                        prediction_val_pand_residual[i]) + '\n'
            ## scale location graph data

            prediction_val_pand_residual
            prediction_val_pand_predict
            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            import math
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            sqrt_residual
            # calculating std deviation
            import statistics

            print(statistics.stdev(prediction_val_pand_residual))
            stdev_ = statistics.stdev(prediction_val_pand_residual)

            # calculate standardised residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict, sqr_std_res):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)
            # QUANTILE

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile(
                'value', x, 0.01)
            print(quantile_std_res_t)
            print(x)
            # calculating the z_score
            from scipy.stats import norm

            ## sort the list
            sorted_std_res = sorted(std_res)

            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            # print(mean)
            quantile = []
            n = len(std_res)
            print(n)
            for x in range(0, n):
                quantile.append((x - 0.5) / (n))

            print(quantile)
            # theoretical z-scores
            z_theory = []
            for x in quantile:
                z_theory.append(norm.ppf(abs(x)))
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)
            Q_label_pred = ''
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(quant) + '\t' + str(val) + '\n'
            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            json_response = {
                'table_data': table_response,
                'graph_data': graph_response
            }

            return json_response

        except Exception as e:
            print('exception is =' + str(e))
Beispiel #27
0
    def ridge(self, dataset_add, feature_colm, label_colm, relation_list,
              relation, userId):
        try:
            dataset = spark.read.csv(dataset_add,
                                     header=True,
                                     inferSchema=True)
            dataset.show()
            Rsqr_list = []
            Rsqr_regPara = {}
            print(self.xt)
            # print(data_add)
            # data = spark.read.csv('/home/fidel/mltest/BI.csv', header=True, inferSchema=True)
            # data.show()
            # f_data = data.select('Sub Total', 'Tax Amount', 'Freight', 'Profit')
            # f_data.show()

            # class A():
            #     def __init__(self, feature='sahil', label='fcuk'):
            #         self.feature = feature
            #         # feature = 'sahil'
            #         self.label = label
            #         # self.test
            #         self.name = 'bro'
            #
            #     def linear_c(self):
            #         print(self.feature, '\n', self.label)
            #         print(self.name)
            #
            # # a = A(feature='test', label='f_t')
            # A(feature='test', label='f_t').linear_c()

            # renaming the colm
            # print(label_colm)
            # dataset.withColumnRenamed(label_colm, "label")
            # print(label_colm)
            # dataset.show()

            label = ''
            for y in label_colm:
                label = y

            print(label)

            # relationship

            if relation == 'linear':
                print('linear relationship')
            if relation == 'non_linear':
                dataset = Relationship(dataset, relation_list)

            dataset.show()

            # implementing the vector assembler

            featureassembler = VectorAssembler(
                inputCols=feature_colm, outputCol="Independent_features")

            output = featureassembler.transform(dataset)

            output.show()
            output.select("Independent_features").show()

            finalized_data = output.select("Independent_features", label)

            finalized_data.show()

            # splitting the dataset into taining and testing

            train_data, test_data = finalized_data.randomSplit([0.75, 0.25],
                                                               seed=40)

            ######################################################################33
            # ridge final
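            # Grid search: fit one ridge model (elasticNetParam=0) for each
            # candidate regParam in self.xt and keep the value that gives the
            # highest training R^2.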
            for t in self.xt:
                lr1 = LinearRegression(featuresCol="Independent_features",
                                       labelCol=label,
                                       elasticNetParam=0,
                                       regParam=t)
                regressor1 = lr1.fit(train_data)
                print(t)
                print("coefficient : " + str(regressor1.coefficients))
                reg_sum = regressor1.summary
                r2 = reg_sum.r2
                Rsqr_list.append(r2)
                Rsqr_regPara[r2] = t
                print(r2)

            print(Rsqr_list)
            print(max(Rsqr_list))
            maximum_rsqr = max(Rsqr_list)
            print(Rsqr_regPara)
            final_regPara = []

            for key, val in Rsqr_regPara.items():
                if (key == maximum_rsqr):
                    print(val)
                    final_regPara.append(val)

            for reg in final_regPara:
                lr_ridge = LinearRegression(featuresCol="Independent_features",
                                            labelCol=label,
                                            elasticNetParam=0,
                                            regParam=reg)
                regressor = lr_ridge.fit(train_data)
                training_summary = regressor.summary
                r2 = training_summary.r2
                print(r2)

            print("coefficient : " + str(regressor.coefficients))
            coefficient_t = str(regressor.coefficients)

            print("intercept : " + str(regressor.intercept))
            intercept_t = str(regressor.intercept)

            prediction = regressor.evaluate(test_data)

            prediction_val = prediction.predictions
            prediction_val.show()

            prediction_val_pand = prediction_val.select(
                label, "prediction").toPandas()

            prediction_val_pand = prediction_val_pand.assign(
                residual_vall=prediction_val_pand[label] -
                prediction_val_pand["prediction"])

            prediction_val_pand_residual = prediction_val_pand["residual_vall"]

            prediction_val_pand_label = prediction_val_pand[label]

            # print prediction_val_pand_residual
            prediction_val_pand_predict = prediction_val_pand["prediction"]
            # print prediction_val_pand_predict

            # test_summary = prediction.summary

            # for test data

            lr_prediction = regressor.transform(test_data)

            lr_prediction.groupBy(label, "prediction").count().show()

            lr_prediction_quantile = lr_prediction.select(label, "prediction")
            lr_prediction_onlypred = lr_prediction.select('prediction')
            # lr_prediction_quantile.show()

            # training_summary = regressor.summary

            print("numof_Iterations...%d\n" % training_summary.totalIterations)
            print("ObjectiveHistory...%s\n" %
                  str(training_summary.objectiveHistory))
            print("RMSE...%f\n" % training_summary.rootMeanSquaredError)
            RMSE = training_summary.rootMeanSquaredError
            print("MSE....%f\n" % training_summary.meanSquaredError)
            MSE = training_summary.meanSquaredError
            print("r**2(r-square)....::%f\n" % training_summary.r2)
            r_square = training_summary.r2
            print("r**2(r-square adjusted)....%f\n" % training_summary.r2adj)
            adjsted_r_square = training_summary.r2adj
            print("deviance residuals %s" %
                  str(training_summary.devianceResiduals))
            training_summary.residuals.show()
            # residual_graph = training_summary.residuals
            # test = (residual_graph, lr_prediction_onlypred)
            # residual_graph.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode='append' )
            # print(test)
            # test.write.csv('/home/fidel/PycharmProjects/predictive_analysis_git', header=True, mode= 'append')
            # residual_graph_pandas = residual_graph.toPandas()
            print("coefficient standard errors: \n" +
                  str(training_summary.coefficientStandardErrors))
            coefficient_error = str(training_summary.coefficientStandardErrors)
            print(" Tvalues :\n" + str(training_summary.tValues))
            T_values = str(training_summary.tValues)
            print(" p values :\n" + str(training_summary.pValues))
            P_values = str(training_summary.pValues)

            #######################################################################################################
            table_response = {
                "Intercept": intercept_t,
                "Coefficients": coefficient_t,
                "RMSE": RMSE,
                "MSE": MSE,
                "R_square": r_square,
                "Adj_R_square": adjsted_r_square,
                "Coefficient_error": coefficient_error,
                "T_value": T_values,
                "P_value": P_values
            }
            #######################################################################################################
            # residual  vs fitted graph

            prediction_data = regressor.summary.predictions
            prediction_data.show()
            prediction_data.select(['prediction']).show()
            predicted = prediction_data.select(['prediction'])
            regressor.summary.residuals.show()
            residuals = regressor.summary.residuals
            pred_d = predicted.withColumn('row_index',
                                          f.monotonically_increasing_id())
            res_d = residuals.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_residuals = pred_d.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')
            pred_residuals.show()

            pred_residuals.write.parquet(
                'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/residual_fitted_train.parquet',
                mode='overwrite')

            ######################################################################################
            # scale location plot training data

            from pyspark.sql.functions import sqrt

            from pyspark.sql.functions import abs as ab

            df_label = prediction_data.select(
                label, 'prediction',
                sqrt(ab(prediction_data[label])).alias("sqrt_label"))

            df_label.show()
            df_sqrt_label_index = df_label.withColumn(
                'row_index', f.monotonically_increasing_id())
            df_sqrt_label_index.show()
            # df_residual_index = df_residual.withColumn('row_index', f.monotonically_increasing_id())
            # df_residual_index.show()
            res_d.show()
            sqrt_label_residual_join = df_sqrt_label_index.join(
                res_d, on=['row_index']).sort('row_index').drop('row_index')

            sqrt_label_residual_join.show()

            std_resid = sqrt_label_residual_join.select(
                'sqrt_label', 'prediction',
                (sqrt_label_residual_join['residuals'] /
                 sqrt_label_residual_join['sqrt_label']).alias('std_res'))

            std_resid.show()
            # std_resid_std_res = std_resid.select("std_res")

            sqrt_std_res = std_resid.select(
                "std_res", 'prediction',
                sqrt(ab(std_resid["std_res"])).alias("sqrt_std_resid"))

            # sqrt_std_res = sqrt(abs(std_resid_std_res["std_res"]))
            sqrt_std_res.show()
            sqrt_std_res_fitted = sqrt_std_res.select('prediction',
                                                      'sqrt_std_resid')

            sqrt_std_res_fitted.write.parquet(
                'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/scale_location_train.parquet',
                mode='overwrite')

            ######################################################################################
            # QUANTILE
            '''
            from pyspark.sql.functions import *
            
            res_d.show()
            sorted_res = res_d.sort('residuals')
            sorted_res.show()
            stdev_ress = sorted_res.select(stddev(col('residuals')).alias('std_dev'),mean(col('residuals')).alias('mean'))
            stdev_ress.show()
            mean_residual = stdev_ress.select(['mean']).toPandas()
            stddev_residual = stdev_ress.select(['std_dev']).toPandas()
            
            for x in range(0, 5):
                print(x/mean_residual)
            

            
            '''
            ####################################################################################
            # appending predicted value to the dataset
            target = dataset.select(label)
            pred = prediction_data.select(['prediction'])
            pred_d = pred.withColumn('row_index',
                                     f.monotonically_increasing_id())
            target_d = target.withColumn('row_index',
                                         f.monotonically_increasing_id())

            pred_target = pred_d.join(target_d,
                                      on=['row_index']).drop('row_index')
            pred_target.show()

            dataset.show()

            pred_target_data_update = dataset.join(pred_target, on=[label])

            pred_target_data_update.show(100)

            ###########################################################

            import matplotlib.pyplot as plt

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)
            #
            # for z in x:
            #     print ("~~~~~   ",z)
            #

            quantile_label = lr_prediction_quantile.approxQuantile(
                label, x, 0.01)
            # print quantile_label
            quantile_prediction = lr_prediction_quantile.approxQuantile(
                "prediction", x, 0.01)

            # creating the csv file and writing into it

            fitted_residual = ''
            print(len(prediction_val_pand_residual))
            length = len(prediction_val_pand_residual)

            for i in range(0, len(prediction_val_pand_residual)):
                fitted_residual += str(
                    prediction_val_pand_predict[i]) + '\t' + str(
                        prediction_val_pand_residual[i]) + '\n'

            with open('residual_vs_fitted.csv', 'w') as r_f:
                writer_r_f = csv.writer(r_f)
                writer_r_f.writerows((prediction_val_pand_predict,
                                      prediction_val_pand_residual))

            # parquet file writing

            ## residual vs leverage graph data

            prediction_val_pand_residual
            # extreme value in the predictor colm
            prediction_col_extremeval = lr_prediction_quantile.agg(
                {"prediction": "max"})
            # prediction_col_extremeval.show()

            # plt.plot(prediction_col_extremeval, prediction_val_pand_residual)
            # plt.show()

            ## scale location graph data
            import math

            prediction_val_pand_residual
            prediction_val_pand_predict
            prediction_val_pand_residual_abs = prediction_val_pand_residual.abs()
            sqrt_residual = []
            for x in prediction_val_pand_residual_abs:
                sqrt_residual.append(math.sqrt(x))
                # print ("____________________  ",x)

            sqrt_residual

            # plt.scatter(sqrt_residual, prediction_val_pand_predict)
            ####################################################################################3

            # calculating std deviation
            import statistics
            print(statistics.stdev(prediction_val_pand_residual))
            stdev_pred = statistics.stdev(prediction_val_pand_residual)
            # mean = statistics.mean(prediction_val_pand_residual)

            # calculate standardised residuals
            std_res = []
            for x in prediction_val_pand_residual:
                std_res.append(x / stdev_pred)
            print(std_res)

            # calculating the square root of std_res
            import math
            sqr_std_res = []
            for x in std_res:
                sqr_std_res.append(math.sqrt(abs(x)))
            print(sqr_std_res)
            #######################################################################################3
            # QUANTILE

            ## sort the list
            sorted_std_res = sorted(std_res)
            print(sorted_std_res)
            #
            mean = statistics.mean(sorted_std_res)
            stdev = statistics.stdev(sorted_std_res)
            print(mean)
            quantile = []
            n = len(sorted_std_res)
            print(n)
            for x in range(0, n):
                quantile.append((x - 0.5) / (n))

            print(quantile)
            #
            # theoretical z-scores
            from scipy.stats import norm

            z_theory = []
            for x in quantile:
                z_theory.append((norm.ppf(abs(x))))
            print(z_theory)
            # z score for real val
            z_pract = []
            for x in sorted_std_res:
                z_pract.append((x - mean) / stdev)

            #

            y = 0.1
            x = []

            for i in range(0, 90):
                x.append(y)
                y = round(y + 0.01, 2)

            quantile_std_res = spark.createDataFrame(std_res, FloatType())
            quantile_std_res.show()
            quantile_std_res_t = quantile_std_res.approxQuantile(
                'value', x, 0.01)
            print(quantile_std_res_t)
            print(x)

            Q_label_pred = ''
            print(len(quantile_label))
            length = len(quantile_label)
            for quant, val in zip(z_theory, z_pract):
                Q_label_pred += str(val) + '\t' + str(quant) + '\n'

            plt.scatter(z_theory, z_pract)
            plt.savefig('q_q')

            ####################################################

            # creating the std residuals

            # square root of label
            sqrt_label = []
            for x in prediction_val_pand_label:
                sqrt_label.append(math.sqrt(abs(x)))

            sqrt_label
            prediction_val_pand_residual
            std_residual = []
            for sqr, resid in zip(sqrt_label, prediction_val_pand_residual):
                std_residual.append(resid / sqr)
                # print(std_sqrt_residual)

            # creating the std sqr root

            sqrt_std_residuals = []
            for x in std_residual:
                # print(math.sqrt(abs(x)))
                sqrt_std_residuals.append(math.sqrt(abs(x)))
            print(sqrt_std_residuals)

            # print(std_sqrt_residual)

            scale_predict_residual = ''
            for pre, res in zip(prediction_val_pand_predict,
                                sqrt_std_residuals):
                scale_predict_residual += str(pre) + '\t' + str(res) + '\n'
            print(scale_predict_residual)

            ##########################################################################
            """
            pred_residuals.show()
            pred_residuals_pandas = pred_residuals.toPandas()
            print(pred_residuals_pandas)
            res_pandas = pred_residuals_pandas['residuals']
            pred_pandas = pred_residuals_pandas['prediction']
            label_list = []
            # for res, pred in zip(res_pandas, pred_pandas):
            #     label_list.append(res+pred)
            label_pand = prediction_data.select([label]).toPandas()
            labe_panda = label_pand[label]

            # sqrt of label column
            sqrt_lab = []
            for lab in labe_panda:
                sqrt_lab.append(math.sqrt(abs(lab)))

            print(res_pandas)
            stdev_res = statistics.stdev(res_pandas)
            std_res_list = []
            for valr, labe in zip(res_pandas,sqrt_lab):
                std_res_list.append(valr/labe)
            print(std_res_list)
            """

            ##########################################################################
            ##########################################################################
            # import math
            # sqrt_stdres = []
            # for x in std_sqrt_residual:
            #     sqrt_stdres.append(math.sqrt(x))
            #
            # scale_predict_residual = ''
            # for pre, res in zip(prediction_val_pand_predict, sqrt_stdres):
            #     scale_predict_residual += str(pre) + 't' + str(res) + 'n'
            # print(scale_predict_residual)

            ###################################3

            # plt.show()

            # scale_predict_residual=''
            #
            # print(len(sqrt_residual))
            # length = len(sqrt_residual)
            #
            # for i in range(0, len(std_sqrt_residual)):
            #     scale_predict_residual += str(prediction_val_pand_predict[i]) + '|' + str(std_sqrt_residual[i]) + '\n'

            # with open('scale_location_plot.csv', 'w') as s_l:
            #     writer_s_l = csv.writer(s_l)
            #     writer_s_l.writerows((prediction_val_pand_predict, sqrt_residual))

            # writing to the parquet

            # prediction_val_pand_predict_tospark = spark.createDataFrame(prediction_val_pand_predict, FloatType())
            # prediction_val_pand_predict_tospark = prediction_val_pand_predict_tospark.withColumnRenamed("value",
            #                                                                                             "prediction")
            #
            # sqrt_residual_tospark= spark.createDataFrame(sqrt_residual, FloatType())
            # sqrt_residual_tospark = sqrt_residual_tospark.withColumnRenamed("value",
            #                                                                                               "sqrt_residual")
            #
            # pred_spark = prediction_val_pand_predict_tospark.withColumn('row_index', f.monotonically_increasing_id())
            # res_spark = sqrt_residual_tospark.withColumn('row_index', f.monotonically_increasing_id())
            #
            # final_scale_fitted = pred_spark.join(res_spark,on=['row_index']) \
            #     .sort('row_index').drop('row_index')
            #
            # final_scale_fitted.show()
            #
            # final_scale_fitted.write.parquet(
            #     'hdfs://10.171.0.181:9000/dev/dmxdeepinsight/datasets/SCALE_LOCATION_PLOT.parquet',
            #     mode='overwrite')
            #

            # dumping the dictionary into json object

            # json_response = {'run_status': 'success', 'PredictiveResponse': resultdf}

            graph_response = {
                "Q_Q_plot": Q_label_pred,
                "residual_fitted": fitted_residual,
                "scale_location": scale_predict_residual
            }

            json_response = {
                'table_data': table_response,
                'graph_data': graph_response
            }

            return json_response

        except Exception as e:
            print('exception is = ' + str(e))
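
# A minimal standalone sketch (not the pipeline above) showing how the
# scale-location data could be computed directly with PySpark column functions
# instead of pandas loops. The DataFrame `predictions` and its column names
# "prediction" and "residual" are assumptions for illustration only.
from pyspark.sql import functions as F

def scale_location_data(predictions, prediction_col='prediction', residual_col='residual'):
    # standard deviation of the residuals, pulled back to the driver once
    stddev = predictions.select(F.stddev(F.col(residual_col))).first()[0]
    # sqrt(|standardized residual|) is the usual y-axis of a scale-location plot
    return (predictions
            .withColumn('sqrt_std_residual',
                        F.sqrt(F.abs(F.col(residual_col) / F.lit(stddev))))
            .select(prediction_col, 'sqrt_std_residual'))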
Beispiel #28
0
def tocolumns(df, expr):
    import math
    import histbook.expr
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
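
# A minimal usage sketch for tocolumns above: rather than constructing histbook
# expression objects here, the hand-written PySpark equivalents of two of the
# mapped cases ("where" and "rad2deg") are shown, so the intent of the
# translation is visible. The DataFrame `df` with numeric columns "x" and "y"
# is an assumption for illustration only.
import math
import pyspark.sql.functions as fcns

def tocolumns_equivalents(df):
    # "where(cond, a, b)" maps to when(cond, a).otherwise(b)
    clipped = fcns.when(df['x'] > 0, df['x']).otherwise(fcns.lit(0.0))
    # "rad2deg(y)" maps to a plain multiplication by 180 / pi
    y_degrees = df['y'] * (180.0 / math.pi)
    return df.withColumn('x_clipped', clipped).withColumn('y_deg', y_degrees)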
Beispiel #29
0
    # New columns for the lookback window, keyed by output column name;
    # `feat`, `dd` and `window` are assumed to be defined earlier in the example.
    new_cols = {
        # count = sum of daily counts
        feat + 'count' + dd:
        f.sum(f.col(feat + 'count_0d')).over(window),

        # A few more complicated examples:

        # mean = weighted mean of daily means
        feat + 'mean' + dd:
        f.sum(f.col(feat + 'mean_0d') * f.col(feat + 'count_0d')).over(window)
        / f.sum(f.col(feat + 'count_0d')).over(window),

        # stddev = sqrt(weighted mean of daily variances)
        feat + 'stddev' + dd:
        f.sqrt(
            f.sum(f.col(feat + 'count_0d') *
                  f.col(feat + 'stddev_0d')**2).over(window) /
            f.sum(f.col(feat + 'count_0d')).over(window)),
    }

    # Loop through the dictionary of new columns and add them to the aggregated
    # dataframe
    for col_name, col_obj in new_cols.items():
        add = SparkWithColumn(name='add_' + col_name,
                              read_key='df_agg',
                              store_key='df_agg',
                              new_col_name=col_name,
                              new_col=col_obj)

        lookback_chain.add(add)
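
# A minimal self-contained sketch of the same roll-up without the
# SparkWithColumn wrapper, using plain PySpark window functions. The column
# names ("f1_count_0d", "f1_mean_0d", "f1_stddev_0d"), the partition/order keys
# and the 30-day lookback are assumptions for illustration only.
from pyspark.sql import functions as f
from pyspark.sql.window import Window

def add_lookback_columns(df_agg):
    window = Window.partitionBy('id').orderBy('day').rowsBetween(-29, 0)
    count_sum = f.sum(f.col('f1_count_0d')).over(window)
    return (df_agg
            # count = sum of the daily counts in the window
            .withColumn('f1_count_30d', count_sum)
            # mean = count-weighted mean of the daily means
            .withColumn('f1_mean_30d',
                        f.sum(f.col('f1_mean_0d') * f.col('f1_count_0d')).over(window) / count_sum)
            # stddev = sqrt of the count-weighted mean of the daily variances
            .withColumn('f1_stddev_30d',
                        f.sqrt(f.sum(f.col('f1_count_0d') * f.col('f1_stddev_0d') ** 2).over(window)
                               / count_sum)))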

# STEP 5: Save the results
Beispiel #30
0
def skewness_custom(column, mean, count):
    # df_sum and df_pow are presumably pyspark.sql.functions.sum and pow
    # imported under aliases; the formula is sqrt(n) * sum((x - mean)^3)
    # divided by sum((x - mean)^2) raised to the power 3/2
    return ((np.sqrt(count) * df_sum(df_pow(column - mean, int(3)))) /
            df_pow(sqrt(df_sum(df_pow(column - mean, int(2)))), 3))

# .show() returns None, so display the aggregated result rather than reassigning it
histData.groupBy("User", "device").agg(*expr).show()
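
# A minimal sketch of the same skewness formula with built-in PySpark
# aggregations instead of the df_sum / df_pow aliases. The DataFrame `df` and
# its numeric column name are assumptions for illustration; the result should
# be comparable to pyspark.sql.functions.skewness on the same column.
from pyspark.sql import functions as F

def skewness_sketch(df, col_name):
    stats = df.agg(F.mean(col_name).alias('mu'), F.count(col_name).alias('n')).first()
    mu, n = stats['mu'], stats['n']
    centered = F.col(col_name) - F.lit(mu)
    moments = df.agg(F.sum(centered ** 3).alias('m3'), F.sum(centered ** 2).alias('m2')).first()
    # skewness = sqrt(n) * sum((x - mu)^3) / (sum((x - mu)^2)) ** 1.5
    return (n ** 0.5) * moments['m3'] / (moments['m2'] ** 1.5)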

# COMMAND ----------

# Question 2 - Using the dataset “activity-data”, create a stream that outputs in one table the total number of meters user g travels per activity in time intervals of resp. 15 minutes and 30 minutes. Order the table by the most distance travelled per activity. Hint: you can use the columns x, y, z to calculate the distance travelled

# COMMAND ----------

staticDF.show(4)

# COMMAND ----------

from pyspark.sql.functions import sqrt

totalDist = streamingDF.select("User", "x", "y", "z").withColumn("Distance", sqrt(pow((streamingDF['x']), 2) + pow((streamingDF['y']), 2) + pow((streamingDF['z']), 2)))

user_g_dist = totalDist\
  .cube("User").sum("Distance")\
  .where("User == 'g'")\
  .writeStream\
  .queryName("user_g_distance")\
  .format("memory")\
  .outputMode("complete")\
  .start()

# COMMAND ----------

from time import sleep

# poll the in-memory table a few times while the stream is running
for x in range(5):
    spark.sql("select * from user_g_distance").show(3)
    sleep(1)
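
# COMMAND ----------

# A minimal sketch of the per-activity windowed aggregation the question above
# asks for: total distance travelled by user g per activity in 15-minute event
# time buckets, ordered by distance. The activity column name "gt" and the
# event-time column "event_time" (a proper timestamp) are assumptions about the
# activity-data schema; the 30-minute variant only changes the window size.
from pyspark.sql.functions import col, window, sqrt, pow as pow_, sum as sum_

dist_per_activity_15m = (streamingDF
    .withColumn("Distance", sqrt(pow_(col("x"), 2) + pow_(col("y"), 2) + pow_(col("z"), 2)))
    .where(col("User") == "g")
    .groupBy(window(col("event_time"), "15 minutes"), col("gt"))
    .agg(sum_("Distance").alias("total_distance"))
    .orderBy(col("total_distance").desc()))

query_15m = (dist_per_activity_15m.writeStream
    .queryName("user_g_distance_per_activity_15m")
    .format("memory")
    .outputMode("complete")
    .start())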
Beispiel #32
0
def skewness_custom(column, mean, count):
    return ((np.sqrt(count) * df_sum(df_pow(column - mean, int(3)))) /
            df_pow(sqrt(df_sum(df_pow(column - mean, int(2)))), 3))
    spark = SparkSession.builder.appName("RLSRateSourceOLS").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    # OLS problem, states to be estimated are a, b and c
    # z = a*x + b * y + c + w, where w ~ N(0, 1)
    a = 0.5
    b = 0.2
    c = 1.2
    noise_param = 1
    label_expression = F.col("x") * a + F.col("y") * b + c + F.col("w")

    input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
        .withColumn("mod", F.col("value") % num_states)\
        .withColumn("stateKey", F.col("mod").cast("String"))\
        .withColumn("x", (F.col("value")/num_states).cast("Integer").cast("Double"))\
        .withColumn("y", F.sqrt("x"))\
        .withColumn("bias", F.lit(1.0))\
        .withColumn("w", F.randn(0) * noise_param)\
        .withColumn("label", label_expression)

    rls = RecursiveLeastSquaresFilter(3)\
        .setStateKeyCol("stateKey")\
        .setRegularizationMatrixFactor(10E6)\
        .setForgettingFactor(0.99)

    assembler = VectorAssembler(inputCols=["x", "y", "bias"],
                                outputCol="features")

    measurements = assembler.transform(input_df)
    query = rls.transform(measurements)\
        .writeStream\