Example No. 1
    def query_random_sample(self,
                            data_frame,
                            table_name,
                            query_str,
                            sample_per=0.1):
        """
        get query data from spark
        :param table_name: name of table you want to get data
        :param query_str: sql strings
        :return: query result as json Object
        """
        try:
            sc = self.spark_session_create("query_radom_sample")
            tfmsa_logger("start query data !")
            hdfs_path = settings.HDFS_DF_ROOT + "/" + data_frame + "/" + table_name

            sqlContext = SQLContext(sc)
            df = sqlContext.read.load(hdfs_path, "parquet")
            df.registerTempTable(table_name)
            result = sqlContext.sql(str(query_str)).sample(False,
                                                           float(sample_per),
                                                           seed=0).collect()

            return result
        except Exception as e:
            tfmsa_logger(e)
            raise Exception(e)
        finally:
            df.unpersist()
            sqlContext.clearCache()
            sqlContext.dropTempTable(table_name)
            sc.clearFiles()
            sc.stop()
            tfmsa_logger("stop context")
Example No. 2
    def get_distinct_dataframe(self, data_frame, table_name, columns):
        """
        get distinct table columns
        :param table_name: name of table you want to get data
        :param query_str: sql strings
        :return: query result as json Object
        """
        try:
            sc = self.spark_session_create("get_distinct_dataframe")
            tfmsa_logger("start find distinct column !")
            hdfs_path = settings.HDFS_DF_ROOT + "/" + data_frame + "/" + table_name
            query_str = "select * from " + table_name

            sqlContext = SQLContext(sc)

            df = sqlContext.read.load(hdfs_path, "parquet")
            df.registerTempTable(table_name)
            result = sqlContext.sql(str(query_str))

            return_data = {}
            for column in columns:
                return_data[column.encode("UTF8")] = result.select(column).map(
                    lambda x: str(x[0]).encode("UTF8")).distinct().collect()

            tfmsa_logger("End find distinct column !")

            return return_data

        except Exception as e:
            tfmsa_logger(e)
            raise Exception(e)
        finally:
            df.unpersist()
            sqlContext.clearCache()
            sqlContext.dropTempTable(table_name)
            sc.clearFiles()
            sc.stop()
            tfmsa_logger("stop context")
Example No. 3
def process(rdd):
    """
    :param rdd: (user_id, movie_id)
    :return: (user_id, [movie_id, ...])
    """
    if rdd.isEmpty():
        return rdd

    spark = getSparkSessionInstance(rdd.context.getConf())
    ssc = SQLContext(spark.sparkContext)

    # update the recommend table with the newest pairs; build a DataFrame of the recommend table
    tmp = rdd.collect()
    for item in tmp:
        collect.update_one({"user_id": item[0]},
                           {"$addToSet": {
                               "movie_list": item[1]
                           }})

    user_list = rdd.map(lambda x: [x[0]]).reduce(reducer)
    tmp_list = []
    for item in collect.find({'user_id': {'$in': user_list}}):
        tmp_list.append((item['user_id'], item['movie_list']))
    tmp_df = ssc.createDataFrame(tmp_list, ['user_id', 'movie_list'])
    ssc.registerDataFrameAsTable(tmp_df, 'recommend')

    # get the whole recommend list of the incoming users
    input_rdd = spark.sparkContext.parallelize(tmp_list).flatMap(mapper3)  # use the context of the current batch
    input_df = spark.createDataFrame(input_rdd, ['user_id', 'movie_id'])
    ssc.registerDataFrameAsTable(input_df, 'input_rating')

    current_rdd = ssc.sql('''SELECT i.user_id, s.*
                        FROM input_rating i, similar s
                        WHERE i.movie_id == s.popular_id ''').rdd

    # expand and merge candidate movies per user (mapper7/reducer/mapper8 are defined elsewhere)
    total_rdd = current_rdd.flatMap(mapper7)\
        .reduceByKey(reducer)\
        .flatMap(mapper8)

    # sort the recommend list by avg_rating
    recommend_df = spark.createDataFrame(total_rdd, ['user_id', 'movie_id'])
    ssc.registerDataFrameAsTable(recommend_df, 'user_new')
    movie_rating_df = ssc.sql(
        '''SELECT u.user_id, u.movie_id, m.rating_avg, m.title
                                 FROM user_new u, movies m
                                 WHERE u.movie_id == m.movie_id''')
    movie_rating_rdd = movie_rating_df.rdd\
        .map(tuple)\
        .map(mapper4)\
        .reduceByKey(reducer)\
        .map(mapper5)

    # filter out recommended items that are already in the recommend.movie_list column,
    # then truncate to RECOMMEND_LENGTH
    movie_rating_df2 = ssc.createDataFrame(movie_rating_rdd,
                                           ['user_id', 'sort_list'])
    ssc.registerDataFrameAsTable(movie_rating_df2, 'sorted')
    movie_list_rdd = ssc.sql('''SELECT s.user_id, s.sort_list, r.movie_list
                                FROM sorted s, recommend r
                                WHERE s.user_id == r.user_id''').rdd
    filtered_movie_rdd = movie_list_rdd.map(mapper6)
    recommend_list = filtered_movie_rdd.map(mapper9).collect()

    # write the recommendation lists back (collect is a MongoDB collection defined elsewhere)
    for item in recommend_list:
        collect.update_one({"user_id": item[0]},
                           {"$set": {
                               "recommend_list": item[1]
                           }})

    # clean up temporary tables
    ssc.dropTempTable('input_rating')
    ssc.dropTempTable('user_new')
    ssc.dropTempTable('recommend')
    ssc.dropTempTable('sorted')

    return filtered_movie_rdd
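process() expects batches of (user_id, movie_id) pairs, presumably from Spark Streaming; a hedged sketch of the wiring only, where the socket source, the "user_id,movie_id" message format, and the 10-second batch interval are assumptions (the mapper*/reducer helpers, the similar and movies tables, and the MongoDB collection collect are defined elsewhere in the original program):

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="movie_recommend_stream")
stream_ctx = StreamingContext(sc, 10)  # 10-second micro-batches, illustrative
pairs = stream_ctx.socketTextStream("localhost", 9999) \
    .map(lambda line: tuple(line.strip().split(",")[:2]))  # assumed "user_id,movie_id" lines
pairs.foreachRDD(process)
stream_ctx.start()
stream_ctx.awaitTermination()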
Example No. 4
def sql_context_api(spark):

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    print("Start running SQL context API")

    # createDataFrame

    l = [('Alice', 1)]
    sqlContext.createDataFrame(l).collect()
    res = sqlContext.createDataFrame(l, ['name', 'age']).collect()
    print(res)
    rdd = sc.parallelize(l)
    sqlContext.createDataFrame(rdd).collect()
    df = sqlContext.createDataFrame(rdd, ['name', 'age'])
    res = df.collect()
    print(res)
    print("createDataFrame API finished")

    # table and cache
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    sqlContext.cacheTable("table1")
    sqlContext.uncacheTable("table1")
    sqlContext.cacheTable("table1")
    sqlContext.clearCache()
    # sqlContext.createExternalTable("table1", schema = df2)
    sqlContext.dropTempTable("table1")
    # res = df2.collect()
    # print(res)
    print("External, TempTable and cache API finished")

    # getConf
    res = sqlContext.getConf("spark.sql.shuffle.partitions")
    print(res)
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    sqlContext.setConf("spark.sql.shuffle.partitions", u"50")
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    print("getConf API finished")

    # newSession
    newspark = sqlContext.newSession()
    print("newSession API finished")

    # range
    res = sqlContext.range(1, 7, 2).collect()
    print(res)
    res = sqlContext.range(3).collect()
    print(res)
    print("range API finished")

    # read
    res = sqlContext.read
    # text_sdf = sqlContext.readStream.text(tempfile.mkdtemp())
    # res = text_sdf.isStreaming
    # print(res)
    print("read and readStream API finished")

    # register

    # sql
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.sql("SELECT name AS f1, age as f2 from table1")
    res = df2.collect()
    print(res)
    print("sql API finished")

    # table
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.table("table1")
    res = (sorted(df.collect()) == sorted(df2.collect()))
    print(res)
    print("table API finished")

    # tableNames
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    res = ("table1" in sqlContext.tableNames())
    print(res)
    res = ("table1" in sqlContext.tableNames("default"))
    print(res)
    print("tableNames API finished")

    # tables
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.tables()
    res = df2.filter("tableName = 'table1'").first()
    print(res)
    print("tables API finished")

    print("Finish running SQL context API")
Example No. 5
# In[9]:

# register the DataFrame as a temporary table named 'df' (temp comes from an earlier notebook cell)
temp.registerTempTable('df')

# In[10]:

# use a SQL CASE expression to assign a score to each activity type
sqlContext = SQLContext(sc)
temp = sqlContext.sql("""
    SELECT *,
           (CASE WHEN activity = '送礼物' THEN 12
                 WHEN activity = '评论' THEN 10
                 WHEN activity = '播放' THEN 2
                 WHEN activity = '下载' THEN 6
            END) AS score
      FROM df
""")

# In[31]:

sqlContext.dropTempTable("df")

# In[13]:

data = temp.groupby(['user_id_x', 'user_work_id']).agg({'score': 'sum'})
data.show(5)

# In[43]:

data = data.withColumnRenamed("sum(score)", "score")

# In[61]:

data.count()

# In[46]:
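These notebook cells assume a DataFrame temp (with user_id_x, user_work_id, and activity columns) built in an earlier cell that is not shown. A self-contained sketch of the same scoring and aggregation using the DataFrame API, with illustrative data:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.appName("activity_score_sketch").getOrCreate()
logs = spark.createDataFrame(
    [(1, 10, "播放"), (1, 10, "评论"), (2, 11, "下载")],
    ["user_id_x", "user_work_id", "activity"])
score = (F.when(F.col("activity") == "送礼物", 12)
          .when(F.col("activity") == "评论", 10)
          .when(F.col("activity") == "播放", 2)
          .when(F.col("activity") == "下载", 6))
data = (logs.withColumn("score", score)
            .groupBy("user_id_x", "user_work_id")
            .agg(F.sum("score").alias("score")))
data.show()
spark.stop()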
Example No. 6
def sql_context_api(spark):

    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    print("Start running SQL context API")

    # createDataFrame
    l = [('Alice', 1)]
    sqlContext.createDataFrame(l).collect()
    res = sqlContext.createDataFrame(l, ['name', 'age']).collect()
    print(res)
    rdd = sc.parallelize(l)
    sqlContext.createDataFrame(rdd).collect()
    df = sqlContext.createDataFrame(rdd, ['name', 'age'])
    res = df.collect()
    print(res)
    print("createDataFrame API finished")

    # table and cache
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    sqlContext.cacheTable("table1")
    sqlContext.uncacheTable("table1")
    sqlContext.cacheTable("table1")
    sqlContext.clearCache()
    # sqlContext.createExternalTable("table1", schema = df2)
    sqlContext.dropTempTable("table1")
    # res = df2.collect()
    # print(res)
    print("External, TempTable and cache API finished")

    # getConf
    res = sqlContext.getConf("spark.sql.shuffle.partitions")
    print(res)
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    sqlContext.setConf("spark.sql.shuffle.partitions", u"50")
    res = sqlContext.getConf("spark.sql.shuffle.partitions", u"10")
    print(res)
    print("getConf API finished")

    # newSession
    newspark = sqlContext.newSession()
    print("newSession API finished")

    # range
    res = sqlContext.range(1, 7, 2).collect()
    print(res)
    res = sqlContext.range(3).collect()
    print(res)
    print("range API finished")

    # read
    res = sqlContext.read
    text_sdf = sqlContext.readStream.text(
        "/ppml/trusted-big-data-ml/work/examples/helloworld.py")
    res = text_sdf.isStreaming
    print(res)
    print("read and readStream API finished")

    # sql
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.sql("SELECT name AS f1, age as f2 from table1")
    res = df2.collect()
    print(res)
    print("sql API finished")

    # table
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.table("table1")
    res = (sorted(df.collect()) == sorted(df2.collect()))
    print(res)
    print("table API finished")

    # tableNames
    df = spark.createDataFrame([('Alice', 5, 80), ('Alice', 5, 80),
                                ('Alice', 10, 80)], ["name", "age", "height"])
    sqlContext.registerDataFrameAsTable(df, "table1")
    res = ("table1" in sqlContext.tableNames())
    print(res)
    res = ("table1" in sqlContext.tableNames("default"))
    print(res)
    print("tableNames API finished")

    # tables
    sqlContext.registerDataFrameAsTable(df, "table1")
    df2 = sqlContext.tables()
    res = df2.filter("tableName = 'table1'").first()
    print(res)
    print("tables API finished")

    # register
    # strlen = sqlContext.registerFunction("stringLengthString", lambda x: len(x))
    # res = spark.sql("SELECT stringLengthString('test')").collect()
    # print(res)
    spark.udf.registerJavaFunction("javaStringLength3",
                                   "org.apache.spark.sql.JavaStringLength",
                                   "integer")
    res = spark.sql("SELECT javaStringLength3('test')").collect()
    print(res)
    print("register API finished")

    print("Finish running SQL context API")
Example No. 7
     StructField("age", IntegerType(), True)
 ])
 res3 = sqlContext.createDataFrame(rdd, schema)
 row = res3.collect()
 print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 5th xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
 print(row)
 # -------------------------------------------- this example currently fails with: ImportError: No module named pandas
 # row = sqlContext.createDataFrame(df.toPandas()).collect()
 # print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 6th xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
 # print(row)
 # -------------------------------------------- createExternalTable(tableName, path=None, source=None, schema=None, **options) and drop it
 print(
     "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 7th xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
 )
 sqlContext.registerDataFrameAsTable(df, "stu5")
 sqlContext.dropTempTable("stu5")
 # ---------------------------------------------- json file: not yet tested
 # ----------------------------------------------
 print(
     "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 8th xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
 )
 sqlContext.registerDataFrameAsTable(df, "stu6")
 df2 = sqlContext.sql("SELECT name AS f1, age as f2 from stu6")
 print(df2.collect())
 sqlContext.table("stu6")
 print(sorted(df.collect()) == sorted(df2.collect()))  # True
 # -------------------------------------------------------
 print("stu6" in sqlContext.tableNames())  # True
 print("stu3" in sqlContext.tableNames())  # False
 # -------------------------------------------------------
 sqlContext.registerDataFrameAsTable(df, "stu5")
Example No. 8
def main(sc):
    sqlContext = SQLContext(sc)
    # In[1]:
    input_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the table of features and labels into dataframes
    df_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Convert all features to double type except for ID and Label, which remain as strings
    # This is done because the Random Forest Algorithm requires features to be numbers
    df_data = df_data.select(
        *(col(c).cast("double").alias(c) for c in df_data.columns[1:-1]),
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'))

    # Defines that the first column is the unique ID, the last one contains the labels and all the ones in between are the given features
    df_master = df_data.rdd.map(lambda r: Row(
        cust_id=r[-2], label=r[-1], features=Vectors.dense(r[:-2]))).toDF()

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master.randomSplit([0.5, 0.5],
                                                              seed=123)

    # Set the Random Forest input to the training set
    rf_init_data = df_master_train

    # Indexing labels for Random Forest Algorithm
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed_label")
    model = labelIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Indexing features for Random Forest Algorithm
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexed_features",
                                   maxCategories=2)
    model = featureIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Configures inbuilt Random Forest Classifier function with 500 trees,
    # max depth = 8 and 32 bins
    rf_init = RandomForestClassifier(labelCol="indexed_label",
                                     featuresCol="indexed_features",
                                     numTrees=500,
                                     impurity="gini",
                                     maxDepth=8,
                                     maxBins=32)

    rf_init_data.persist()  # Cache the data set
    rf_init_model = rf_init.fit(
        rf_init_data)  # Run the Random Forest Algorithm

    rf_init_data.unpersist()

    # Extract a list of feature importances from the output of the Random Forest
    # Algorithm with each element corresponding to a feature
    rf_init_varimp = np.sqrt(rf_init_model.featureImportances.toArray())

    # Creates a list containing the 6 most important features to be used later
    # to subset our entire data from 146 features to just 6!

    # Create a list containing the names of all features
    column_names = df_data.columns[:-2]

    #Creating a dictionary mapping feature names to their respective importances
    NameToImp = dict()
    for i in range(len(column_names)):
        key = column_names[i]
        value = rf_init_varimp[i]
        NameToImp[key] = value

    # Sorted list in reverse order according to the variable importances
    sorted_varimp = sorted(NameToImp.values(), reverse=True)

    # Collect importances of 6 most important features
    sorted_top_varimp = sorted_varimp[:6]

    # Sorted list of column names in reverse order according to varimp
    sorted_colnames = sorted(NameToImp, key=NameToImp.get, reverse=True)

    # Collect colnames of 6 most imp features
    col_names = sorted_colnames[:6]

    # Pulling data for the 6 most important features
    df_data_new = df_data.select(
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Defines that the first column is the unique ID, the last one contains the labels and all the ones in between are the given features
    df_master_new = df_data_new.rdd.map(lambda r: Row(
        cust_id=r[0], label=r[1], features=Vectors.dense(r[2:]))).toDF()

    # Scale and normalize the features so that all features can be compared,
    # and create a new column for the scaled features
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled_features",
                            withStd=True,
                            withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(df_master_new)

    # Normalize each feature to have unit standard deviation.
    df_master_new = scalerModel.transform(df_master_new)

    #The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unbalanced features
    df_master_new = df_master_new.drop('features')

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master_new.randomSplit([0.5, 0.5],
                                                                  seed=123)

    test_all = df_master_test

    sqlContext.registerDataFrameAsTable(df_master_train,
                                        "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all = m.transform(train_all)

    test_all = m.transform(test_all)

    sqlContext.dropTempTable("df_master_train_table")

    # Create a list of tasks containing tuples of number of neighbours and
    # cutoff frequencies to be passed to KNN algorithm
    number_of_neighbours = [250, 550, 750, 1000]
    popshared = 0.30
    num_indices = int(popshared * (test_all.count()))
    tasks = []
    for num_neighbour in number_of_neighbours:
        tasks = tasks + [(num_neighbour, num_indices)]

    # Partitioning the tasks for parallel processing
    tasksRDD = sc.parallelize(tasks, numSlices=len(tasks))
    tasksRDD.collect()

    train_pd = train_all.toPandas()
    test_pd = test_all.toPandas()

    train_pd['indices'] = train_pd.index
    test_pd['indices'] = test_pd.index

    # Converting features into SparseVector format
    l_train = list()
    for k in train_pd.scaled_weighted_features:
        l_train.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

    l_test = list()
    for k in test_pd.scaled_weighted_features:
        l_test.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

    # Converting to numpy arrays
    knn_train = np.asarray(l_train)
    knn_test = np.asarray(l_test)
    # Broadcasting the training and test sets to all partitions
    train_broadcast = sc.broadcast(knn_train)
    test_broadcast = sc.broadcast(knn_test)

    # Calling K Nearest Neighbour search on each partition
    tree_type = "kd_tree"
    resultsRDD = tasksRDD.map(lambda nc: findNearestNeighbour(
        train_broadcast, test_broadcast, nc[0], nc[1], test_pd, tree_type))
    resultsRDD.cache()
    resultsRDD.count()

    resultsPD = resultsRDD.toDF().toPandas()

    resultsPD["popshared"] = popshared
    resultsPD = resultsPD.rename(columns={'_1': 'Recall'})
    resultsPD = resultsPD.rename(columns={'_2': 'Number of Neighbors'})

    bestResult = (resultsPD.sort_values(by=["Recall"], ascending=[0])).iloc[0]
    bestNN = int(bestResult["Number of Neighbors"])
    bestRecall = bestResult["Recall"]

    # saving the model info - varimp,recall,NN,col_names to model_path
    column_names = [i for i in col_names]
    model_info = sc.parallelize([{
        "varimp": sorted_top_varimp,
        "recall": bestRecall,
        "NN": bestNN,
        "col_names": column_names
    }])
    model_info.saveAsPickleFile(path=model_info_path)

    # saving the scaler model to model_path
    scalerModel.write().overwrite().save(model_scaler_path)

    # saving the train set to model_path
    df_master_new.rdd.saveAsPickleFile(path=model_train_set_path)
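main(sc) expects a live SparkContext and real paths (the empty-string paths above are placeholders in the source); a minimal driver sketch, with the app name as an assumption:

from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    conf = SparkConf().setAppName("rf_knn_training")
    sc = SparkContext(conf=conf)
    main(sc)
    sc.stop()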
Example No. 9
sqlContext.registerDataFrameAsTable(dfsub1, 'temp')
df2 = sqlContext.sql("""SELECT ActionGeo_CountryCode,
                               CAST(SQLDATE AS INTEGER), CAST(MonthYear AS INTEGER), CAST(Year AS INTEGER),
                               CASE WHEN Actor1Type1Code = '' AND Actor2Type1Code <> '' THEN Actor2Type1Code
                                    ELSE Actor1Type1Code END AS Actor1Type1Code,
                               CAST(NumArticles AS INTEGER),
                               CAST(GoldsteinScale AS INTEGER),
                               CAST(AvgTone AS INTEGER)
                          FROM temp
                         WHERE ActionGeo_CountryCode <> '' AND ActionGeo_CountryCode IS NOT NULL
                            AND Actor1Type1Code <> '' AND Actor1Type1Code IS NOT NULL
                            AND NumArticles <> '' AND NumArticles IS NOT NULL
                            AND GoldsteinScale <> '' AND GoldsteinScale IS NOT NULL
                            AND AvgTone <> '' AND AvgTone IS NOT NULL""")

sqlContext.dropTempTable('temp')
sqlContext.registerDataFrameAsTable(df2, 'temp3')
sqlContext.cacheTable('temp3')

dfdaily = sqlContext.sql("""SELECT ActionGeo_CountryCode,
                                   SQLDATE,
                                   Actor1Type1Code,
                                   SUM(NumArticles) AS NumArticles,
                                   ROUND(AVG(GoldsteinScale),0) AS GoldsteinScale,
                                   ROUND(AVG(AvgTone),0) AS AvgTone
                              FROM temp3
                             GROUP BY ActionGeo_CountryCode,
                                      SQLDATE,
                                      Actor1Type1Code""")

dfmonthly = sqlContext.sql("""SELECT ActionGeo_CountryCode,
Example No. 10
# In[40]:

sqlContext.createDataFrame(rdd).collect()

# In[41]:

df = sqlContext.createDataFrame(rdd, ['name', 'age'])

# In[42]:

df.collect()

# In[43]:

sqlContext.registerDataFrameAsTable(df, "table1")
sqlContext.dropTempTable("table1")

# In[44]:

sqlContext.getConf("spark.sql.shuffle.partitions")

# In[45]:

sqlContext.getConf("spark.sql.shuffle.partitions", u"10")

# In[46]:

sqlContext.setConf("spark.sql.shuffle.partitions", u"50")

# In[47]:
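The same configuration round-trip can be done at the SparkSession level; a minimal sketch (the app name is illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("conf_sketch").getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", "50")
print(spark.conf.get("spark.sql.shuffle.partitions"))        # "50"
print(spark.conf.get("spark.sql.shuffle.partitions", "10"))  # still "50"; "10" is only a fallback
spark.stop()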
Example No. 11
def main(sc):
    sqlContext = SQLContext(sc)
    input_path = ''
    output_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    #IMPORT THE CLIENT DATA
    client_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Load the models and train data from Training Interface paths
    model_info = sc.pickleFile(model_info_path).flatMap(
        lambda x: x.items()).collectAsMap()
    scalerModel = StandardScalerModel.load(model_scaler_path)
    df_master_new = sc.pickleFile(model_train_set_path).toDF()

    col_names = model_info['col_names']
    sorted_top_varimp = model_info['varimp']

    # Pulling data for the 6 most important features
    client_data = client_data.select(
        client_data.u_msisdn.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Defines that the first column is the unique ID, the last one contains the labels and all the ones in between are the given features
    client_master = client_data.rdd.map(
        lambda r: Row(cust_id=r[0], features=Vectors.dense(r[1:]))).toDF()

    # Scale and normalize the features so that all features can be compared,
    # and create a new column for the scaled features
    client_scaler = StandardScaler(inputCol="features",
                                   outputCol="scaled_features",
                                   withStd=True,
                                   withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = client_scaler.fit(client_master)

    # Normalize each feature to have unit standard deviation.
    client_master = scalerModel.transform(client_master)

    #The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unbalanced features
    client_master = client_master.drop('features')

    sqlContext.registerDataFrameAsTable(df_master_new, "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all_client = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all_client = m.transform(train_all_client)

    client_master = m.transform(client_master)

    sqlContext.dropTempTable("df_master_train_table")

    nn = 1000
    popshared = 0.30
    num_indices = int(popshared * client_master.count())
    tree_type = "kd_tree"

    train_pd = train_all_client.toPandas()
    test_pd = client_master.toPandas()

    freq_table = findNearestNeighbour_client(train_pd, test_pd, nn,
                                             num_indices, tree_type)

    sqlContext.createDataFrame(freq_table[['cust_id', 'freq']], ).repartition(
        1).write.format("com.databricks.spark.csv").save(output_path)
Example No. 12
 def drop_table(self, table_name):
     u"""Drop the temporary table."""
     # assumes the enclosing class holds a SQLContext instance as self.sqlContext;
     # calling dropTempTable on the SQLContext class itself would fail
     self.sqlContext.dropTempTable(table_name)
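A hedged sketch of the kind of wrapper class such a method could belong to; the class name and the sqlContext attribute are assumptions, not from the source:

from pyspark import SparkContext
from pyspark.sql import SQLContext

class TableHelper(object):
    """Hypothetical wrapper; the original enclosing class is not shown."""

    def __init__(self, sc):
        self.sqlContext = SQLContext(sc)

    def drop_table(self, table_name):
        """Drop the temporary table."""
        self.sqlContext.dropTempTable(table_name)

# illustrative usage
sc = SparkContext(appName="drop_table_sketch")
helper = TableHelper(sc)
df = helper.sqlContext.createDataFrame([("a", 1)], ["k", "v"])
helper.sqlContext.registerDataFrameAsTable(df, "tmp_table")
helper.drop_table("tmp_table")
sc.stop()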