Code example #1
# Imports assumed for this excerpt (they mirror the ones shown in code example #3 below).
import sys
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import split, collect_list
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
from pyspark.ml.clustering import KMeans


def main(argv):

    # Instantiate the Spark context.
    sc = SparkContext(appName="KMeans-Clustering-dhoyoso-dsernae")
    # Start the Spark session.
    spark = SparkSession(sc)
    # Language whose stop words will be removed.
    language = argv[4]  #"spanish"
    # Output path for the clusters.
    pathout = argv[3]
    # Path from which the files will be read.
    path = argv[2]  #"hdfs:///user/dhoyoso/datasets/dataset/"
    # Number of clusters to build.
    k = int(argv[1])  #4
    # Collect the files to process from the path.
    files = sc.wholeTextFiles(path)
    # Build the DataFrame schema: 2 columns, one for the path and one for the text.
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    # Create the DataFrame from the schema and the files.
    df = spark.createDataFrame(files, schema)
    # Tokenize the text using the ML Tokenizer class.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    # Tell the stop words remover which language we are working with;
    # loadDefaultStopWords() only returns the list, so it is passed to the remover explicitly.
    # Remove the stop words from the tokens.
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=StopWordsRemover.loadDefaultStopWords(language))
    # Apply hashing TF to the remaining tokens.
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2000)
    # Apply IDF to the output of the hashingTF.
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    # Initialize KMeans with the desired k; it runs on the IDF output (the "features" column).
    kmeans = KMeans(k=k)
    # Build the pipeline (the map of transformations).
    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    # Fit the pipeline starting from the DataFrame.
    model = pipeline.fit(df)
    # Run the mapped transformations and keep the result.
    results = model.transform(df)
    results.cache()
    # Split the path to keep only the document name alongside its cluster (prediction).
    split_col = split(results['path'], '/')
    results = results.withColumn('docname', split_col.getItem(7))
    df = results.select("docname", "prediction")

    # Group the documents of each cluster into cluster_docs_list and write them to the output path as JSON.
    grouped = df.groupBy(['prediction']).agg(
        collect_list("docname").alias('cluster_docs_list'))
    grouped.coalesce(1).write.json(path=pathout, mode="overwrite")
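
# A minimal entry-point sketch (an assumption; the original excerpt starts at main()).
# The job would be submitted roughly as:
#   spark-submit <script>.py <k> <input_path> <output_path> <language>
if __name__ == "__main__":
    main(sys.argv)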
Code example #2
File: SparkTest.py Project: jiasy/PY_Service
class SparkTest(BaseService):
    def __init__(self, sm_):
        super().__init__(sm_)
        self.testJsonFilePath = fileUtils.getPath(self.resPath,
                                                  "testJsonStr.json")
        self.conf: SparkConf = None
        self.sc: SparkContext = None
        self.spark: SparkSession = None
        self.sqlCtx: SQLContext = None
        # self.writeJsonToResPath()
        self.initSpark()

    def writeJsonToResPath(self):
        self.testJsonStr = '{"result":1245186,"roomCardCount":1000000,"battleId":0,"roomId":0,"marqueeVersion":{"low":1,"high":0,"unsigned":false},"newMail":null,"newLimitedCostlessActivity":false,"noticeVersion":{"low":0,"high":0,"unsigned":false},"activityInfo":[{"id":300001,"startTime":{"low":-576284416,"high":345,"unsigned":false},"endTime":{"low":-72284416,"high":345,"unsigned":false}},{"id":300005,"startTime":{"low":-224153600,"high":349,"unsigned":false},"endTime":{"low":438245400,"high":350,"unsigned":false}},{"id":300008,"startTime":{"low":-2140022784,"high":353,"unsigned":false},"endTime":{"low":-238493672,"high":133356,"unsigned":false}},{"id":300004,"startTime":{"low":-1994684416,"high":345,"unsigned":false},"endTime":{"low":-1908285416,"high":345,"unsigned":false}},{"id":300002,"startTime":{"low":131409920,"high":355,"unsigned":false},"endTime":{"low":217808920,"high":355,"unsigned":false}},{"id":300007,"startTime":{"low":-2140022784,"high":353,"unsigned":false},"endTime":{"low":-584823784,"high":353,"unsigned":false}},{"id":300000,"startTime":{"low":-576284416,"high":345,"unsigned":false},"endTime":{"low":46435072,"high":368,"unsigned":false}}],"buttonValue":13,"timeStamp":{"low":1316055502,"high":355,"unsigned":false},"clubId":null,"createTime":{"low":1037369829,"high":355,"unsigned":false},"connGroup":"c74d97b01eae257e44aa9d5bade97baf","isIdentityVerify":false,"isAgency":false,"agtWebUrl":"","combatId":0,"area":10002,"displayId":5198814,"mttStartTime":{"low":0,"high":0,"unsigned":false},"ticket":0,"phone":"","notifyRedDot":[],"pushRegisterId":""}'
        fileUtils.writeFileWithStr(self.testJsonFilePath, self.testJsonStr)

    def initSpark(self):
        # Cluster URL: the special value "local" runs Spark on a single local thread, with no need to connect to a cluster.
        _clusterType = "local"
        # Application name: when connecting to a cluster, appName helps you find your application in the cluster manager's UI.
        _appName = self.app.appName
        self.conf = SparkConf().setMaster(_clusterType).setAppName(_appName)
        self.sc = SparkContext(conf=self.conf)
        self.spark = SparkSession \
            .builder \
            .appName("Python Spark SQL basic example") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
        self.sqlCtx = SQLContext(self.sc)
        global blankLines
        blankLines = self.sc.accumulator(0)

    def create(self):
        super(SparkTest, self).create()
        # self.test_parallelize()
        # _testTextRDD = self.test_textFile()
        # self.test_RDD(_testTextRDD)
        # self.test_map()
        # self.test_flatMap()
        # self.test_createPairRDD()
        # self.test_aggregate()
        self.test_jsonStrToDataFrame()
        # self.test_jsonFileWrite()
        # self.test_sparkSql()
        # self.test_createSchema()
        # self.test_accumulator()
        # self.test_sparkStreaming()
        # self.test_presto()

    def destroy(self):
        super(SparkTest, self).destroy()

    def test_parallelize(self):
        # Build an RDD from a local collection
        self.sc.parallelize(["pandas", "i like pandas"])
        # Split the data into two partitions
        self.sc.parallelize([1, 2, 3, 4], 2)

    def test_textFile(self):
        _testTextFilePath = fileUtils.getPath(self.app.resPath, "README.md")
        # Spark RDDs support two kinds of operations (transformations and actions)
        _testTextRDD = self.sc.textFile(_testTextFilePath)
        return _testTextRDD

    def test_RDD(self, targetRDD_):
        # Passing a function to Spark
        def pythonInLine(line_):
            # Make sure the filter function does not reference self or the like; otherwise the reference gets serialized and the cost of shipping it to Spark goes up.
            return "Python" in line_

        # Transformations
        _pythonInLineRDD = targetRDD_.filter(pythonInLine)
        _starInLineRDD = targetRDD_.filter(lambda line: "* " in line)
        # union() returns a new RDD containing all the elements of both RDDs; it may hold duplicates
        _unionRDD = _pythonInLineRDD.union(_starInLineRDD)
        # Deduplicate
        _unionRDD = _unionRDD.distinct()
        # Ask Spark to cache this RDD; cache() is the same as calling persist() with the default storage level.
        _unionRDD.cache()
        # Actions: Spark only evaluates these RDDs lazily;
        # every time we call a new action, the whole RDD is recomputed from scratch,
        # and it is only really computed the first time it is used in an action (count and first are both actions)
        _count = _unionRDD.count()
        self.app.info.log("_count = " + str(_count))
        _first = _unionRDD.first()
        # print("_first = " + str(_first))
        _take5 = _unionRDD.take(5)  # take 5 elements
        # print("_take5 = " + str(_take5))
        _all = _unionRDD.collect()  # collect everything (mind the data size, don't blow up the driver's memory)

        # intersection() returns an RDD of the elements present in both RDDs
        _intersectionRDD = _pythonInLineRDD.intersection(_starInLineRDD)
        _same = _intersectionRDD.collect()

    # map: after processing every element of the RDD, we get a new RDD
    def test_map(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        _numsRDD = self.sc.parallelize([1, 2, 3, 4])
        # map() takes a function, applies it to every element of the RDD, and uses the return values as the result
        _squaredRDD = _numsRDD.map(lambda x: x * x)
        # Print every element (the squared numbers)
        for _num in _squaredRDD.collect():
            print("%i " % _num)

    # flatMap: each element of the RDD can turn into several elements, which together form the new RDD
    def test_flatMap(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        _lines = self.sc.parallelize(["hello world", "hi"])
        # flatMap yields an RDD of the elements contained in each list, not an RDD of lists
        _words = _lines.flatMap(lambda line: line.split(" ")).collect()
        for _word in _words:
            print("%s " % _word)
        # Read the JSON file; wholeTextFiles returns (file path, file content) pairs
        _jsonRDD = self.sc.wholeTextFiles(self.testJsonFilePath)

        # For each pair, parse the second item (the content string) as JSON and use its activityInfo entries as the elements of the new RDD.
        def _getActivityInfoFunc(jsonInfoKV_):
            _jsonValue = jsonInfoKV_[1]
            _activityInfoDict = json.loads(_jsonValue)["activityInfo"]
            return _activityInfoDict

        _json_activityInfo_RDD = _jsonRDD.flatMap(
            lambda _jsonInfoKV: (_getActivityInfoFunc(_jsonInfoKV)))
        # Deduplicate (note: the result of distinct() is not assigned, so this call has no effect here)
        _json_activityInfo_RDD.distinct()
        # RDD to DataFrame
        _json_activityInfo_DF = self.spark.createDataFrame(
            _json_activityInfo_RDD)
        # Create a temporary table
        _json_activityInfo_DF.registerTempTable("activityInfo")
        _resultsRDD = self.sqlCtx.sql(
            "SELECT startTime.low,endTime.low FROM activityInfo WHERE id = 300001L"
        )

        for _result in _resultsRDD.collect():
            print("_result = " + str(_result))

    # Create a pair RDD
    def test_createPairRDD(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        _lines = self.sc.parallelize(
            ["key1 value1", "key2 value2", "key2 value22", "key3 value3"])
        # Convert each line into a (key, value) pair
        _pairs = _lines.map(lambda _item:
                            (_item.split(" ")[0], _item.split(" ")[1]))
        # Keep pairs whose value string is at most 6 characters long
        _pairs = _pairs.filter(lambda _keyValue: len(_keyValue[1]) <= 6)
        # Print every key-value pair that meets the condition
        for (_key, _value) in _pairs.collect():
            print(str(_key) + " = " + str(_value))

    def test_aggregate(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        # Count how many times each word appears
        _lines = self.sc.parallelize(
            ["hello value1", "hello value2", "hi value22", "f**k value3"])
        # Get every word: ["hello","value1","hello","value2","hi","value22","f**k","value3"]
        _words = _lines.flatMap(lambda _line: _line.split(" "))
        # Turn each word into a (word, 1) tuple (pair RDD)
        _wordAndOnes = _words.map(lambda _word: (_word, 1)).cache()
        # In the pair RDD, the first element is the key and the second is the value
        _wordReduce = _wordAndOnes.reduceByKey(
            lambda _valueReduce, _valueNext: _valueReduce + _valueNext)
        # Print each word and its count
        for _key, _value in _wordReduce.collect():
            print(str(_key) + " = " + str(_value))

        # Count word occurrences with aggregateByKey
        _wordAgg = _wordAndOnes.aggregateByKey(
            0,  # initial value
            (lambda _valueReduce, _valueNext: _valueReduce + _valueNext
             ),  # merge the RDD elements into the accumulator
            (lambda _reduce, reduceNext: _reduce + reduceNext)  # merge accumulators pairwise
        )
        # Print each word and its count
        for _key, _value in _wordAgg.collect():
            print(str(_key) + " = " + str(_value))

        # Compute the average
        _numsRDD = self.sc.parallelize([1, 2, 3, 4])
        _sumInfo = _numsRDD.aggregate(
            (0, 0),  # initial value (running sum, count)
            (lambda _sumReduce, _value:
             (_sumReduce[0] + _value, _sumReduce[1] + 1)
             ),  # add each value to the running sum and bump the counter
            (lambda _sumReduceReduce, _sumReduceNext:
             (_sumReduceReduce[0] + _sumReduceNext[0], _sumReduceReduce[1] +
              _sumReduceNext[1]))  # merge the accumulators once more
        )
        _average = _sumInfo[0] / float(_sumInfo[1])
        print("_average = " + str(_average))

    def test_jsonStrToDataFrame(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        _jsonDataFrame = self.spark.read.json(self.testJsonFilePath)
        _jsonDataFrame.printSchema()
        _jsonDataFrame.show()
        _activityInfo = _jsonDataFrame.selectExpr("activityInfo")
        _activityInfo.show()

    def test_jsonFileWrite(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        # Path to write the file to
        _writeToPath = fileUtils.getPath(self.app.resPath, "activityInfo.json")
        # Produce a list of (file path, file content) tuples
        jsonRDD = self.sc.wholeTextFiles(self.testJsonFilePath)
        for _jsonKeyValue in jsonRDD.collect():
            print("_jsonKeyValue = " + str(_jsonKeyValue))
        # For each tuple in the list, take the file content and parse it into a JSON dict, forming a new RDD
        # Filter the dict RDD, keeping only the elements that contain the key, to form a new RDD
        # Extract the field from each element to form a new RDD
        jsonDataFilter = jsonRDD \
            .map(lambda _jsonKeyValue: json.loads(_jsonKeyValue[1])) \
            .filter(lambda _jsonDict: _jsonDict["activityInfo"]) \
            .map(lambda _jsonDict: _jsonDict["activityInfo"])
        # RDD made of the contents of the activityInfo field
        for _jsonDataFilter in jsonDataFilter.collect():
            print("_jsonDataFilter = " + str(_jsonDataFilter))
        # Write a copy only if it has not been written before
        if not os.path.exists(_writeToPath):
            jsonDataFilter.saveAsTextFile(_writeToPath)

    #
    def test_sparkSql(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        _tweets = self.sqlCtx.read.json(self.testJsonFilePath)
        _tweets.printSchema()
        _tweets.show()
        _tweets.registerTempTable("tempTable")
        # Even when a single attribute is filtered out, the query structure stays the same: field lookups still have to start from the top-level activityInfo attribute.
        # _activityInfo = _tweets.selectExpr("activityInfo")
        # _activityInfo.show()
        # _activityInfo.registerTempTable("tempTable")
        _resultsRDD = self.sqlCtx.sql(
            "SELECT activityInfo.startTime.low,activityInfo.endTime.low FROM tempTable"
        )
        for _result in _resultsRDD.collect():
            print("_result = " + str(_result))

    def test_createSchema(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        # Data
        _jsonDatas = [{
            'a': 'aaa',
            'b': 'bbb',
            'c': 'ccc'
        }, {
            'a': 'aaaa',
            'b': 'bbbb',
            'c': 'cccc',
            'd': 'dddd',
            'e': 'eeee'
        }]
        _jsonDatas = [json.dumps(_jsonDict) for _jsonDict in _jsonDatas]

        # Known structure
        schema = ['a', 'b', 'c', 'd']
        fields = [
            StructField(_fieldName, StringType(), True)
            for _fieldName in schema
        ]
        schema = StructType(fields)

        rdd = self.sc.parallelize(_jsonDatas)
        # Known fields are kept; unknown fields are dropped
        df = self.sqlCtx.read.schema(schema).json(rdd)
        for data in df.collect():
            print("data = " + str(data))
        df.registerTempTable("tempTable")
        _resultsRDD = self.sqlCtx.sql("SELECT c,d FROM tempTable")
        for _result in _resultsRDD.collect():
            print("_result = " + str(_result))

    def test_accumulator(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")

        def accumulatorFunc():
            global blankLines
            blankLines += 1

        for i in range(10):
            accumulatorFunc()

        global blankLines
        print("blankLines = " + str(blankLines))

    def test_sparkStreaming(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        self.getSubClassObject("SparkStreaming")
        self.getSubClassObject("Kafka")
        # self.showCurrentBaseObejctsInfo()
        self.sparkStreaming.destroy()
        self.kafka.destroy()
        # self.showCurrentBaseObejctsInfo()

    def test_presto(self):
        print(self.className + " - " +
              pyUtils.getCurrentRunningFunctionName() + "------------------")
        self.getSubClassObject("Presto")
        self.presto.doTest()
        self.presto.destroy()
Code example #3
File: clustering.py Project: emonto15/KMeansSpark
#!/usr/bin/env python
import sys
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover
from pyspark.ml.clustering import KMeans
#Check if all the params were passed
if (len(sys.argv) > 5):
    #Setup the sparkContext
    sc = SparkContext(appName="SparkClustering-emonto15-dperezg1")
    spark = SparkSession(sc)
    #Read from hdfs and save using a schema (path,text)
    files = sc.wholeTextFiles("hdfs://" + sys.argv[1])
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    df = spark.createDataFrame(files, schema)
    #Divide the text into an array of words
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    #Setup the language to remove the stopwords; loadDefaultStopWords only returns the list, so it is passed to the remover explicitly
    #Read from column tokens (which is the output of the tokenizer object) and save a new array of words without the stopwords
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=StopWordsRemover.loadDefaultStopWords(sys.argv[4]))
    #Create a hash of each word and its frequency in each document, keeping only the number of features set by the numFeatures parameter
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
Code example #4
        }

def process(record,schema):
    split_text = []
    if schema == 'OSM':
        d = "1-"
        for i,e in enumerate(record.split("\n1-")):
            if i > 0:
                split_text.append(d+e)
            else:
                split_text.append(e)
    else:
        split_text = [row for row in record.split("\n") if row]
    return split_text
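
# Hypothetical usage sketch (not part of the original job), showing what process() returns:
# for the 'OSM' schema the "1-" prefix is restored on every record after the first,
# and for any other schema blank rows are simply dropped.
assert process("1-a~^b\n1-c~^d", "OSM") == ["1-a~^b", "1-c~^d"]
assert process("a~^b\n\nc~^d", "other") == ["a~^b", "c~^d"]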

f = sc.wholeTextFiles('%(sourcedir)s/RAW/ingest_date=%(ingestdate)s' % options)
schema = options['schema']
stage1 = f.collect()
split_text = process(stage1[0][1],schema)
header = split_text[0].split("~^")
split_text.pop(0)
if split_text:
    rows = [row.split("~^") for row in split_text if row]

df_writer = sqlContext.createDataFrame(rows,header)

df_writer.registerTempTable("df_writer")
sqlContext.sql("create database if not exists ingest_%(db)s" % options)
sqlContext.sql("create table if not exists ingest_%(db)s.%(schema)s_%(table)s_schema \
                    row format serde \
                    'org.apache.hadoop.hive.serde2.avro.AvroSerDe' \
Code example #5
File: code.py Project: mamup11/bi--data
#https://spark.apache.org/docs/2.2.0/ml-pipeline.html
#https://stackoverflow.com/questions/35769489/adding-the-resulting-tfidf-calculation-to-the-dataframe-of-the-original-document
#https://spark.apache.org/docs/2.2.0/ml-features.html#tf-idf
#Imports assumed for this excerpt; MLHashingTF and MLIDF are taken to be aliases of the
#pyspark.ml.feature HashingTF and IDF classes used below.
import sys
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import HashingTF as MLHashingTF, IDF as MLIDF

if len(sys.argv) != 3:
    print("Error!")
    print("Usage: " + sys.argv[0] + " PATH " + "NumberOfClusters")
    exit(-1)

path = str(sys.argv[1])
cluster_number = int(sys.argv[2])

sc = SparkContext('local')
spark = SparkSession(sc)

#Example path = hdfs:///user/mmurill5/datasets/miniGutenberg
files = sc.wholeTextFiles(path)
documents = spark.createDataFrame(files, ["doc_id", "doc_text"])
#documents.printSchema()

df = (documents.rdd.map(
    lambda x: (x.doc_id, x.doc_text.split(" "))).toDF().withColumnRenamed(
        "_1", "doc_id").withColumnRenamed("_2", "text"))

htf = MLHashingTF(inputCol="text", outputCol="tf")
tf = htf.transform(df)
#tf.select("text", "tf").show()

idf = MLIDF(inputCol="tf", outputCol="features")
tfidf = idf.fit(tf).transform(tf)
#tfidf.select("tf", "features").show()