from pyspark.sql import HiveContext


def sql_hive_context_example(spark):
    # Create a HiveContext from the session's underlying SparkContext.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    rows = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(rows, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable 
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1, 10, 2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    # Caching is lazy: the table is materialized in memory on first use;
    # uncacheTable evicts it, and clearCache drops all cached tables.
    df = hive_ctx.range(1, 10, 2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable (not exercised in this example)

    # newSession (not exercised in this example)

    # registerFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead

    # registerJavaFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead

    # setConf & getConf
    # Spark SQL configuration is key/value based; getConf also accepts a
    # default value as an optional second argument.
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist
    
    print("Finish running HiveContext API")
Example #2

import numpy
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row, functions
from pyspark.mllib.clustering import KMeans
# `als` is assumed to be a project-local module providing the table schemas
# referenced below (als._categry_shame, als.hintWord_shame, als.searchApp_shame).
import als

class HqlSpark():
    # initialize the Spark SQL (Hive) context
    def __init__(self, language='cn'):
        self.conf = SparkConf().setAppName("HqlSparkAPP").setSparkHome(
            "/home/spark").set("spark.driver.allowMultipleContexts", "true")
        self.sc = SparkContext(conf=self.conf)
        self.sql = HiveContext(self.sc)
        self.sql.sql('use anserchapp')
        self.curlan = language
        self.curDataWord = None
        # self.loadWord_state = False

    # get the words for the current language
    def getAllWord(self, language='cn'):
        if self.curlan == language and self.curDataWord:
            return
        self.curlan = language
        sql_s = 'select * from searchapp_%s limit 100' % self.curlan
        self.curDataWord = self.sql.sql(sql_s)

    def refreshTable(self, tableName):
        self.sql.refreshTable(tableName)

    # create table
    def createTable(self, sql_sentence, tableName):
        self.sql.sql(sql_sentence)
        self.sql.refreshTable(tableName)
        print('create table success')

    # insert data into a table
    def insertData(self, sql_sentence):
        self.sql.sql(sql_sentence)

    # insert data into a table from a data structure (list of tuples/lists)
    def insertDataFromStruct(self,
                             data,
                             tableName='searchapp_',
                             d_type='cn',
                             state=False):
        if d_type == '':
            in_data = self.sql.createDataFrame(data, als._categry_shame)
        elif d_type == 'hint':
            in_data = self.sql.createDataFrame(data, als.hintWord_shame)
            d_type = ''
        else:
            in_data = self.sql.createDataFrame(data, als.searchApp_shame)
        # append when state is True, otherwise overwrite the table
        in_data.saveAsTable(tableName=tableName + d_type,
                            mode='append' if state else 'overwrite')

    # delete a table's data by dropping its temp registration
    def deleteDataFromTable(self, table='searchapp_', d_type='ch'):
        self.sql.dropTempTable(table + d_type)
        self.sql.refreshTable(table + d_type)

    def showTables(self):
        tables = self.sql.sql('show tables').collect()
        return [table['tableName'] for table in tables]

    def getData(self, sql_hive):
        datas = self.sql.sql(sql_hive).collect()
        return datas

    # find hint words in the hintword table that match the input words
    def selectHintWord(self, base_wordFr):
        hintWord = self.sql.sql('select word,hintWord from hintword')
        word = hintWord.join(base_wordFr,
                             hintWord.hintWord == base_wordFr.word,
                             'outer').select(hintWord.word).distinct()
        word_news = self.curDataWord.join(
            word, word.word == self.curDataWord.word,
            'outer').select(self.curDataWord.word, 'priority', 'searchApp',
                            'searchCount', 'genre').distinct()
        word_news = word_news.dropna(how='any')
        return word_news

    # find words in searchapp that match the given app ids
    def selectAppIdWord(self, appIds):
        result = None
        word = None
        for appId in appIds:
            res = self.curDataWord.filter(
                functions.array_contains(self.curDataWord.searchapp,
                                         appId)).select(
                                             'word', 'priority', 'searchApp',
                                             'searchCount',
                                             'genre').distinct()
            result = res if result is None else result.unionAll(res)
        if result is not None:
            result = result.dropna(how='any')
            word = result.select('word')
        return result, word

    # find words in searchApp that match the given genre ids
    def selectGenreWord(self, genreIds):
        result = None
        for gId in genreIds:
            res = self.curDataWord.filter(
                functions.array_contains(self.curDataWord.genre,
                                         gId)).select('word', 'priority',
                                                      'searchApp',
                                                      'searchCount',
                                                      'genre').distinct()
            result = res if result is None else result.unionAll(res)
        return result

    # gather all words for analysis
    def getAnalysisWords(self, appIds, genreIds):
        if appIds is None or len(appIds) <= 0:
            return None
        appWord, word = self.selectAppIdWord(appIds)
        genreWord = None
        thinkWord = None
        if genreIds is not None and len(genreIds) > 0:
            genreWord = self.selectGenreWord(genreIds)
        if word and word.count() > 0:
            thinkWord = self.selectHintWord(word)

        # union whichever of the three result sets exist, then de-duplicate
        result = None
        for part in (appWord, genreWord, thinkWord):
            if part:
                result = part if result is None else result.unionAll(part)
        return result.distinct() if result else None

    # build the clustering input matrix (one row per word, one column per genre)
    def buildMatrix(self, words):
        class_all = self.sql.sql(
            "select genreID from category order by genreID desc")
        c_genres = class_all.collect()
        genres = {}
        for i, c in enumerate(c_genres):
            genres.setdefault(c.genreID, i)

        datas = words.select('genre').collect()
        Matrix = numpy.zeros((len(datas), len(c_genres)))
        print(len(Matrix))
        # set a 1 in each genre column the word belongs to
        for num, data in enumerate(datas):
            for ge in data.genre:
                Matrix[num][genres.get(ge)] = 1
        return Matrix

    # get input data
    def getInPut(self, appIds, genreIds):
        words = self.getAnalysisWords(appIds, genreIds)
        return self.buildMatrix(words)

    # k-means analysis
    def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10):
        cluster_data = self.sc.parallelize(Matrix)
        model = KMeans.train(cluster_data, Kcluster, MaxIterations, runs)
        return model.predict(cluster_data).collect()

    # combine words with their cluster assignments
    def combine_data(self, words=None, result=None):
        len_re = len(result)
        len_w = words.count()
        if len_re != len_w:
            print('word num:', len_w, 'is not equal to result num:', len_re)
        if len_re < len_w:
            words = self.sql.createDataFrame(words.take(len_re))
        else:
            result = result[0:len_w]
            print(words.count(), len(result))
        result = [str(x) for x in result]
        cluster_re = self.sc.parallelize(result, 1)
        # zip each word row with its cluster label (both sides use a single
        # partition so the element counts line up)
        re = words.rdd.map(list).repartition(1).zip(cluster_re).map(
            lambda p: Row(word=p[0][0], priority=int(p[0][1]),
                          searchcount=int(p[0][3]), cluster=p[1]))
        return self.sql.createDataFrame(re)

    # select representative class words and key words per cluster
    def selectWord(self, cluster_sha, top_K=2):
        df = cluster_sha
        select_par = df.groupBy('cluster').agg({
            'searchcount': 'avg',
            'priority': 'avg'
        }).collect()
        ClusterNum = len(select_par)
        clusterWord = []
        for line in select_par:
            cluster_df = df.filter(df.cluster == line[0]).select(
                'word', 'priority', 'searchcount')
            ClassWord = cluster_df.filter(
                cluster_df.searchcount > line[1]).select('word', 'priority')
            ClassWord = ClassWord.filter(
                ClassWord.priority >= line[2]).select('*').limit(top_K)

            KeyWords = cluster_df.filter(
                cluster_df.searchcount < line[1]).select('word', 'priority')
            KeyWords = KeyWords.filter(
                KeyWords.priority >= line[2]).select("*").limit(top_K)
            cluster = {
                'cluster_id': line[0],
                'classWord': ClassWord.toJSON().collect(),
                'keyWord': KeyWords.toJSON().collect()
            }
            clusterWord.append(cluster)
        result = {'ClusterNum': ClusterNum, 'AllCluster': clusterWord}
        return result
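
A hedged end-to-end sketch of how this class appears intended to be driven;
the app id, genre id, and cluster count below are hypothetical placeholders,
not values from the original:

if __name__ == '__main__':
    hql = HqlSpark(language='cn')
    hql.getAllWord('cn')                      # cache the current-language words
    app_ids, genre_ids = [284882215], [6005]  # hypothetical ids
    words = hql.getAnalysisWords(app_ids, genre_ids)
    matrix = hql.buildMatrix(words)
    labels = hql.spark_means(matrix, Kcluster=3)
    clustered = hql.combine_data(words=words, result=labels)
    print(hql.selectWord(clustered, top_K=2))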
Example #3

us_census = sqlContext.sql("""
    SELECT
    -- earlier column aliases are truncated in the original snippet
    REGION as region,
    DIVISION as division,
    STATE as state,
    COUNTY as county,
    COUNTYCC as countycc,
    COUNTYSC as countysc,
    COUSUB as cousub,
    COUSUBCC as cousubcc,
    COUSUBSC as cousubsc,
    PLACE as place,
    PLACECC as placecc,
    AREALAND as area_land,
    AREAWATR as area_water,
    FUNCSTAT as func_stat,
    POP100 as pop_count,
    HU100 as housing_unit,
    INTPTLAT as latitude,
    INTPTLON as longitude,
    STATENS as state_ansi,
    COUNTYNS as county_ansi,
    COUSUBNS as county_sub_ansi,
    PLACENS as place_ansi
    FROM us_census_dataframe
    """)

us_census.registerTempTable("us_census")
us_census.write.save(output_file, mode='overwrite')

sqlContext.dropTempTable("us_census_dataframe")
print("Completed")