from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover, Tokenizer


def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                 outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(),
                   outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(),
                            outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(),
                    predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole fitted pipeline is saved to a folder

    return model, words_prediction
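# Illustrative only (not part of the original source): a minimal driver for
# fit_kmeans, assuming the imports above are in scope. The toy `products`
# DataFrame and its rows are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("FitKMeansDemo").getOrCreate()
products = spark.createDataFrame(
    [("red running shoes",), ("blue running shoes",), ("stainless steel kettle",)],
    ["title"],
)
model, clustered = fit_kmeans(spark, products)
# The step counter names the prediction column "6_kmeans"
clustered.select("title", "6_kmeans").show()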
def token(dataframe, in_col, out_col):
    tokenizer = Tokenizer(inputCol=in_col, outputCol=out_col)
    dataframe = tokenizer.transform(dataframe)
    dataframe.printSchema()
    return dataframe
def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Process the dataset and build feature vectors.
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    # Train the decision tree model.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    # Test the model.
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Test on user data: a single news item.
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    # Evaluate the model.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    return render(request, {'resultList': resultList})
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType([StructField("label", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("abstract", StringType(), True)])

    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                     'train.csv')), header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')), header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'],
                                                            args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'],
                                                            args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
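# A hedged usage sketch for run_tf_idf_spark_ml (not from the original source);
# assumes a SparkSession named `spark` and a DataFrame with a string "body" column.
posts = spark.createDataFrame(
    [("spark makes distributed computing approachable",),
     ("tf idf gives rare words more weight",)],
    ["body"],
)
run_tf_idf_spark_ml(posts).select("body", "features").show(truncate=False)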
def predictLabel(label, title, model):
    """Predict the label of a news item."""
    # Assumes `sqlContext` and a fitted `idfModel` are available in the enclosing scope.
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    # Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2])))
    # Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    # Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    # Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
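# Hypothetical call to create_features (a sketch, not in the original); assumes
# the surrounding script provides `sc` and `sqlContext`, and feeds an RDD of
# (appid, price, sentence) tuples.
raw = sc.parallelize([
    (1, 0.99, "fun puzzle game"),
    (2, 4.99, "productivity todo app"),
])
create_features(raw).select("appid", "price", "features").show(truncate=False)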
def preprocessing_titles(path, name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    # after stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData = remover.transform(wordsData)
    df = wordsData.map(lambda x: x['id']).zipWithUniqueId().toDF(["id", "index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    qr = sqlContext.sql("SELECT index, words.id, filtered FROM indices JOIN words ON words.id = indices.id")
    if name != '':
        exportOnS3(qr, "s3a://redit-preprocessed/", name)
    qr = qr.map(lambda row: (row['index'], row['id'], row['filtered']))
    return qr
def _build_stages(self):
    self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
    self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
    self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
    self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
    self.lr = LogisticRegression(maxIter=10, regParam=0.01)
    return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]
def getPipeline(self, df):
    # notify pipeline
    self.success('Initializing ML Pipeline ...')

    # initialize our tokenizer; we're going to tokenize features
    tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
    # convert the tokenized data to vectorized data
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
    # initialize the logistic regression algorithm
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    # create / initialize the ml pipeline
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # fit the pipeline on our training dataframe
    model = pipeline.fit(df)
    return model
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() &
                                  (dataset['useragent_locale'].isNull() |
                                   (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))) \
                             .reduceByKey(lambda x, y: x + y) \
                             .toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])) \
                             .toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)) \
                                .rdd \
                                .map(lambda p: (p['signature'],
                                                sorted(zip(p['tfidf_features'].indices,
                                                           p['tfidf_features'].values),
                                                       key=lambda i: i[1], reverse=True)[:10])) \
                                .collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests])
                 for signature, bests in bests_per_doc])
def main():
    '''
    Takes one input argument: the location of the directory with the training and test data files.
    :return: prints the area under the ROC curve to the console.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures) \
                                  .addGrid(lr.regParam, regParam) \
                                  .build()

    cv = CrossValidator().setEstimator(pipeline) \
                         .setEvaluator(BinaryClassificationEvaluator()) \
                         .setEstimatorParamMaps(paramGrid) \
                         .setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)

    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
path = './txt_p'
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
filecontent = len(files)

dataset = []
cont = 0
for f in files:
    j = os.path.join(path, f)
    with open(j, 'r') as myfile:
        data = myfile.read().replace('\n', '')
    cont = cont + 1
    dataset.append((cont, f, data))

rdd = sc.parallelize(dataset)
schemaData = rdd.map(lambda x: Row(num=x[0], title=x[1], text=x[2]))
dataFrame = sqlContext.createDataFrame(schemaData)

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(dataFrame)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()

# Normalize and transform the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity computation using the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(), outputCol="features1")
    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(), outputCol="features2")
    vecAssembler = VectorAssembler(inputCols=["features1", "features2"], outputCol="features")
    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])
    paramGrid = (ParamGridBuilder().addGrid(lor.maxIter, [10, 20]).addGrid(
        lor.regParam, [0.1, 0.01]).build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3,
                              estimatorParamMaps=paramGrid,
                              evaluator=eva,
                              numFolds=2)
    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{"name": "Tokenizer_1"}, {"name": "HashingTF_1"}],
            },
            {
                "name": "Pipeline_3",
                "stages": [{"name": "Tokenizer_2"}, {"name": "HashingTF_2"}],
            },
            {
                "name": "CrossValidator",
                "evaluator": {"name": "MulticlassClassificationEvaluator"},
                "tuned_estimator": {
                    "name": "Pipeline_4",
                    "stages": [
                        {"name": "VectorAssembler"},
                        {"name": "OneVsRest", "classifier": {"name": "LogisticRegression"}},
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
df_news = sqlContext.sql("SELECT Date, Top1, Top2, Top25 FROM combined_news_djia_csv")

num_word_features = 2000

# news data only goes to July 2016
df_news = sqlContext.sql("SELECT * FROM combined_news_djia_csv")
df_news = df_news.select(
    "Date",
    concat(col("Top1"), lit(" "), col("Top2"), lit(" "), col("Top3"), lit(" "),
           col("Top4"), lit(" "), col("Top5"), lit(" "), col("Top6"), lit(" "),
           col("Top7"), lit(" "), col("Top8"), lit(" "), col("Top9"), lit(" "),
           col("Top10"), lit(" "), col("Top11"), lit(" "), col("Top12"), lit(" "),
           col("Top13"), lit(" "), col("Top14"), lit(" "), col("Top15"), lit(" "),
           col("Top16"), lit(" "), col("Top17"), lit(" "), col("Top18"), lit(" "),
           col("Top19"), lit(" "), col("Top20"), lit(" "), col("Top21"), lit(" "),
           col("Top22"), lit(" "), col("Top23"), lit(" "), col("Top24"), lit(" "),
           col("Top25")).alias("all_text_dirty"))
df_news = df_news.withColumn("all_text_1", regexp_replace(col("all_text_dirty"), "['\"]", ""))
df_news = df_news.withColumn("all_text", expr("substring(all_text_1, 2, length(all_text_1)+1)"))
df_news = df_news.dropna()

tokenizer = Tokenizer(inputCol="all_text", outputCol="words")
wordsData = tokenizer.transform(df_news)

remover = StopWordsRemover(inputCol="words", outputCol="wordsFil")
wordsDataFil = remover.transform(wordsData)

hashingTF = HashingTF(inputCol="wordsFil", outputCol="rawFeatures", numFeatures=num_word_features)
featurizedData = hashingTF.transform(wordsDataFil)

# alternatively, CountVectorizer can also be used to get term frequency vectors
idf = IDF(inputCol="rawFeatures", outputCol="news_features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#df_news = rescaledData.select("Date", "news_features")
# Import csv for training data
start_data = spark.read.format("csv").option("header", "true").load("data/sepsis.csv")

# DATA CLEANUP

# Remove NULLs
start_data = start_data.na.drop(subset=["CATEGORY", "COMMENT"])

# Filter to ensure that category is pulled in correctly
start_data = start_data.filter(start_data['CATEGORY'].isin('include', 'exclude'))

# BUILD FEATURES

# Create a length column to be used as a future feature
from pyspark.sql.functions import length

data = start_data.withColumn('length', length(start_data['COMMENT']))

# Create all the features to the data set
include_exclude_to_num = StringIndexer(inputCol='CATEGORY', outputCol='label')
tokenizer = Tokenizer(inputCol="COMMENT", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

# Create feature vectors
# See https://spark.apache.org/docs/latest/ml-features.html#vectorassembler
# This just creates a new, single vector of features that is the concatenation
# of the tf-idf data and the length of the comment
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

# DATA PROCESSING PIPELINE

# Create and run a data processing Pipeline
# See https://spark.apache.org/docs/latest/ml-pipeline.html#pipeline
data_prep_pipeline = Pipeline(stages=[include_exclude_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])
# In earlier steps you:
# - removed stop words
# - applied the hashing trick
# - converted the data from counts to IDF and
# - trained a logistic regression model.
# Each of these steps was done independently. This seems like a great application for a pipeline!

# Instructions
# 100 XP
# Create an object for splitting text into tokens.
# Create an object to remove stop words. Rather than explicitly giving the input column name,
# use the getOutputCol() method on the previous object.
# Create objects for applying the hashing trick and transforming the data into a TF-IDF.
# Use the getOutputCol() method again.
# Create a pipeline which wraps all of the above steps as well as an object to create a
# Logistic Regression model.

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
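# A hedged illustration of running the assembled pipeline end to end (not part
# of the exercise); `spark` and the toy `sms` DataFrame are assumptions.
sms = spark.createDataFrame(
    [("win a free prize now now now", 1.0),
     ("see you at lunch tomorrow", 0.0),
     ("free entry claim your prize", 1.0),
     ("meeting moved to thursday", 0.0)],
    ["text", "label"],
)
pipeline_model = pipeline.fit(sms)
pipeline_model.transform(sms).select("text", "label", "prediction").show(truncate=False)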
def sentence_data(df_data):
    # Assumes `removepunctuations` is a registered UDF; alias its output so it
    # matches the tokenizer's input column.
    df_data2 = df_data.select(df_data._id, removepunctuations(df_data.text_entry).alias('textentry'))
    only_words = Tokenizer(inputCol='textentry', outputCol="words")
    df_data3 = only_words.transform(df_data2)
    return df_data3
def trainModel(self):
    logger.info("Training the model...")

    query = '''select page_id, max(page_title) as page_title
               from cooladata
               where date_range(all) and page_id is not null
               group by page_id;'''

    def SQLtoURL(query):
        data = query.replace('\n', ' ').replace('\t', ' ').replace('   ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(
            data={'tq': query},
            url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
            headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},
        )
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())
    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")

    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)

    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")
def main():
    set_pandas_options()

    app_name = "Case Study 2: Email Analytics"
    conf = SparkConf().setAppName(app_name)
    conf = (conf.setMaster('local[*]')
                .set("spark.driver.host", "localhost")
                .set('spark.executor.memory', '4G')
                .set('spark.driver.memory', '8G')
                .set('spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info("pyspark script logger initialized")

    # 1 Load data into Spark DataFrame
    LOG = get_hdfs_filepath('*/*/*')

    # read text file
    log_txt_df = sc.wholeTextFiles(LOG).filter(lambda line: line != '').toDF()

    # Convert strings to columns
    udf1 = udf(to_utc_timestamp, TimestampType())
    df = log_txt_df
    df = df.select(df._2.alias('line'))
    temp = df.select(
        regexp_extract(col('line'), r'Message-ID:\s<.*>', 0).alias('Message_ID'),
        regexp_extract(
            col('line'),
            r'\d{1,2}\s\w{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s(\+|\-)\d{4}(.*)',
            0).alias("Date"),
        regexp_extract(col('line'), r'From:\s(.*)', 0).alias("From"),
        regexp_extract(
            col('line'),
            r"To:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(\S+@\S+)(?:\n|\r\n?)Subject:\s",
            0).alias("To"),
        regexp_extract(
            col('line'),
            r"Subject:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}",
            1).alias("Subject"),
        regexp_extract(
            col('line'),
            r"Cc:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(?:\n|\r\n?)Mime-Version:\s",
            0).alias("Cc"),
        regexp_extract(col('line'), r'Mime-Version:\s(.+)', 1).alias("Mime_Version"),
        regexp_extract(col('line'), r'Content-Type:\s(.*)', 1).alias("Content_Type"),
        regexp_extract(col('line'), r"Content-Transfer-Encoding:\s(.+)", 1).alias("Content_Transfer_Encoding"),
        regexp_extract(col('line'), r"X-From:\s(.*)(?:\n|\r\n?)X-To:\s", 0).alias("X_From"),
        regexp_extract(col('line'), r'X-To:\s(.*)(?:\n|\r\n?)X-cc:\s', 0).alias("X_To"),
        regexp_extract(col('line'), r'X-cc:\s(.*)(?:\n|\r\n?)X-bcc:\s', 0).alias("X_cc"),
        regexp_extract(col('line'), r'X-bcc:\s(.*)(?:\n|\r\n?)X-Folder:\s', 0).alias("X_bcc"),
        regexp_extract(col('line'), r'X-Folder:\s(.*)(?:\n|\r\n?)X-Origin:\s', 0).alias("X_Folder"),
        regexp_extract(col('line'), r"X-Origin:\s(.*)(?:\n|\r\n?)X-FileName:\s", 0).alias("X_Origin"),
        regexp_extract(col('line'), r"X-FileName:\s(.*)", 0).alias("X_FileName"),
        regexp_extract(
            col('line'),
            r"X-FileName:\s(.*)((?:\n|\r\n?){1,}(.*)){1,}((?:(?:\n|\r\n?).+)+)",
            0).alias("FYI"))
    #temp.cache()

    temp1 = temp.select(
        expr("substring(Message_ID, 14, length(Message_ID)-14)").alias("Message_ID"),
        'Date',
        udf1('Date').alias('UTC_timestamp'),
        expr("substring(From, 7, length(From)-6)").alias("From"),
        expr("substring(To, 5, length(To)-15)").alias("To"),
        "Subject",
        expr("substring(Cc, 5, length(Cc)-20)").alias("Cc"),
        "Mime_Version",
        "Content_Type",
        'Content_Transfer_Encoding',
        expr("substring(X_From, 9, length(X_From)-16)").alias("X_From"),
        expr("substring(X_To, 7, length(X_To)-14)").alias("X_To"),
        expr("substring(X_cc, 7, length(X_cc)-15)").alias("X_cc"),
        expr("substring(X_bcc, 8, length(X_bcc)-19)").alias("X_bcc"),
        expr("substring(X_Folder, 11, length(X_Folder)-22)").alias("X_Folder"),
        expr("substring(X_Origin, 11, length(X_Origin)-24)").alias("X_Origin"),
        expr("substring(X_FileName, 13, length(X_FileName)-15)").alias("X_FileName"),
        regexp_replace(
            col('FYI'),
            r"(X-FileName:\s(.*)(?:\n|\r\n?){1,})|(-*Original Message-*(.*)((?:\n|\r\n?){1,}(.*)){0,}((?:(?:\n|\r\n?).+)+))",
            '').alias('FYI'))
    #temp1.cache()

    result = temp1.select(
        "Message_ID",
        'Date',
        'UTC_timestamp',
        "From",
        regexp_replace(col('To'), r"\r\n\t", "").alias("To"),
        "Subject",
        regexp_replace(col('Cc'), r"\r\n\t", "").alias("Cc"),
        "Mime_Version",
        "Content_Type",
        'Content_Transfer_Encoding',
        "X_From", "X_To", "X_cc", "X_bcc", "X_Folder", "X_Origin", "X_FileName",
        regexp_replace(col('FYI'), r"(^\s{1,})|(\n{2,})", '').alias('FYI'))

    zz = result.limit(5).toPandas()
    LOGGER.info("\n\n1.\tLoad data into Spark DataFrame\tDone!\n\n{}\n".format(zz))

    # 2 Display the top 10 high-frequency users based on weekly numbers of emails sent
    df1 = result
    freq = df1.groupBy('From').agg(
        (count('UTC_timestamp') /
         ((max(unix_timestamp(col('UTC_timestamp'))) -
           min(unix_timestamp(col('UTC_timestamp')))) / 604800)).alias('rate_per_week')
    ).orderBy("rate_per_week", ascending=False)
    zz = freq.limit(10).toPandas()
    LOGGER.info(
        "\n\n2.\tDisplay the top 10 high-frequency users based on weekly numbers of emails sent\tDone!\n\n{}\n"
        .format(zz))

    # 3a Extract top 20 keywords from the subject text for the top 10 high-frequency users
    top = freq.limit(10)
    top_subj = df1.join(top, df1["From"] == top["From"], "inner").select(df1['From'], df1['Subject'])
    top_texts = top_subj.groupBy("From").agg(concat_ws(" ", collect_list("Subject")).alias("texts"))
    top_texts = top_texts.select('texts').agg(concat_ws(" ", collect_list("texts")).alias("subjects"))

    # Extract words
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(top_texts)

    # Remove stopwords, extending the stop words dictionary with custom stop words
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "", "fw"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)

    # Extract the top 20 keywords after removing the common stop words
    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3a.\tExtract top 20 keywords from the subject text for the top 10 high-frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 3b Extract top 20 keywords from the subject text for the non-high frequency users
    w = Window().orderBy(lit('A'))
    bottom = freq.orderBy("rate_per_week", ascending=False).withColumn("row_num", row_number().over(w))
    bottom = bottom.where(col('row_num') > 10).select('From', 'rate_per_week')
    bottom_subj = df1.join(bottom, df1["From"] == bottom["From"], "inner").select(df1["From"], df1["Subject"])
    bottom_texts = bottom_subj.groupBy("From").agg(concat_ws(" ", collect_list("Subject")).alias("texts"))
    bottom_texts = bottom_texts.select('texts').agg(concat_ws(" ", collect_list("texts")).alias("subjects"))

    # Extract words
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(bottom_texts)

    # Remove stopwords (custom stopwords)
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)

    # Generate features
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3b.\tExtract top 20 keywords from the subject text for the non-high frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 6 Introduce a new column label to identify new, replied, and forwarded messages
    df = result

    def to_label(sbj):
        l1 = "RE" if sbj.startswith("RE:") else ("FW" if sbj.startswith("FW:") else 'NEW')
        return l1

    udf2 = udf(to_label, StringType())
    df_with_label = df.withColumn('label', udf2("Subject"))
    zz = df_with_label.limit(5).toPandas()
    LOGGER.info(
        "\n\n6.\tIntroduce a new column label to identify new, replied, and forwarded messages\tDone!\n\n{}\n"
        .format(zz))

    # 7 Get the trend of the overall mail activity using a pivot table from Spark itself
    pivotDF = df_with_label.groupBy(
        year("UTC_timestamp").alias('year'),
        month("UTC_timestamp").alias('month')).pivot("label").count().orderBy("year", "month")
    zz = pivotDF.na.fill(0).toPandas()
    LOGGER.info(
        "\n\n7.\tGet the trend of the overall mail activity using a pivot table from Spark itself\tDone!\n\n{}\n"
        .format(zz))

    # 8 Use k-means clustering to create 4 clusters from the extracted keywords
    raw = result.select("Message_ID", "From", "Subject")

    # Extract words
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("Subject").setOutputCol("words")
    transformed = tokenizer.transform(raw)

    # Remove stopwords (custom stopwords)
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    cleaned = cleaned.select("Message_ID", "words", "filtered")

    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
    featured = cvmodel.transform(cleaned)

    kmeans = KMeans(k=4, seed=1)  # 4 clusters here
    model = kmeans.fit(featured.select('features'))
    transformed = model.transform(featured)
    zz = transformed.limit(5).toPandas()
    LOGGER.info(
        "\n\n8.\tUse k-means clustering to create 4 clusters from the extracted keywords\tDone!\n\n{}\n"
        .format(zz))

    # 9 Use LDA to generate 4 topics from the extracted keywords
    LOGGER.info(
        "\n\n9.\tUse LDA to generate 4 topics from the extracted keywords\tDone!\n\n{}\n{}\n{}\n{}\n"
        .format(get_topic(0, transformed), get_topic(1, transformed),
                get_topic(2, transformed), get_topic(3, transformed)))
#print "loading 20 newsgroups dataset..." tic = time() dataset = fetch_20newsgroups(shuffle=True, random_state=0, remove=('headers','footers','quotes')) train_corpus = dataset.data # a list of 11314 documents / entries toc = time() print ("elapsed time: %.4f sec" %(toc - tic) ) #distribute data corpus_rdd = sc.parallelize(train_corpus) corpus_rdd = corpus_rdd.map(lambda doc: re.sub(r"[^A-Za-z]", " ", doc)) corpus_rdd = corpus_rdd.map(lambda doc: u"".join(doc).encode('utf-8').strip()) rdd_row = corpus_rdd.map(lambda doc: Row(raw_corpus=str(doc))) newsgroups = spark.createDataFrame(rdd_row) tokenizer = Tokenizer(inputCol="raw_corpus", outputCol="tokens") newsgroups = tokenizer.transform(newsgroups) newsgroups = newsgroups.drop('raw_corpus') stopwords = StopWordsRemover(inputCol="tokens", outputCol="tokens_filtered") newsgroups = stopwords.transform(newsgroups) newsgroups = newsgroups.drop('tokens') count_vec = CountVectorizer(inputCol="tokens_filtered", outputCol="tf_features", vocabSize=num_features, minDF=2.0) count_vec_model = count_vec.fit(newsgroups) vocab = count_vec_model.vocabulary newsgroups = count_vec_model.transform(newsgroups) newsgroups = newsgroups.drop('tokens_filtered') #hashingTF = HashingTF(inputCol="tokens_filtered", outputCol="tf_features", numFeatures=num_features) #newsgroups = hashingTF.transform(newsgroups)
from pyspark.ml.feature import StringIndexer, Tokenizer, CountVectorizer, IDF
from pyspark.ml import Pipeline
import numpy as np

# Remove rows with null values
reviewText_data = csv_data.select('reviewText')
reviewText_data = reviewText_data.na.drop()

# Register a 'function' to clean text
cleantext = spark.udf.register("cleantext", clean_text)

# Cleaned reviewText data
clean_reviewText_data = reviewText_data.select(cleantext("reviewText").alias("reviewText"))

# Convert sentences into discrete words
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")

# Calculate term frequency for each word
tf = CountVectorizer(inputCol="words", outputCol="tf", vocabSize=2**6, minDF=0.05, minTF=1)

# Calculate IDF given the term frequency
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=1)  # minDocFreq: remove sparse terms

# Fit the cleaned reviewText data through the pipeline
pipeline = Pipeline(stages=[tokenizer, tf, idf])
pipelineFit = pipeline.fit(clean_reviewText_data)
train_df = pipelineFit.transform(clean_reviewText_data)

# Save TF-IDF as text files in datanodes
train_df.rdd.saveAsTextFile("hdfs://ec2-34-239-131-131.compute-1.amazonaws.com:9000/output15/")
def lower_text(line):
    word_list = re.findall(r'[\w_]+', line.lower())
    return ' '.join(map(str, word_list))

filter_data_withColumn = filter_data.withColumn(
    "text_lower", udf(lower_text, StringType())("Text")).select('text_lower', 'Score')

# Showing the result
filter_data_withColumn.show(15)

# # Tokenize

# In[11]:

tokenize = Tokenizer(inputCol="text_lower", outputCol="words")
words_Data_Frame = tokenize.transform(filter_data_withColumn)
words_Data_Frame.take(5)

# # Remove Stopword

# In[12]:

remove = StopWordsRemover(inputCol="words", outputCol="filtered_words")
words_Data_Frame1 = remove.transform(words_Data_Frame).select("filtered_words", "Score")
words_Data_Frame1.show(5)

# # Stemming
spark = SQLContext(sc)

# LOADING DATA FROM HDFS TO SPARK DATAFRAME
df0 = spark.read.option("sep", "\t").option('header', True).csv(
    'hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv')
df0.printSchema()

# FILTERING FOR EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull()) & (col("verified_purchase").isNotNull()))

# ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

# CREATING TF-IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# NAIVE BAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

# Model training
model = nb.fit(rescaledData)

# Model saving
model.write().overwrite().save("./NB_model")
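# A small follow-up sketch (an assumption, not in the original script): the
# saved model can be reloaded later and applied to freshly featurized data.
from pyspark.ml.classification import NaiveBayesModel

reloaded = NaiveBayesModel.load("./NB_model")
reloaded.transform(rescaledData).select("class_res", "prediction").show(5)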
trainingCount = parts.count()
f = indexedTweets.map(lambda p: Row(tindex=int(p[1]), tweet=p[0][0],
                                    label=int(float(p[0][1])), training=1))
#f = parts.map(lambda p: Row(tweet=p[0], label=int(p[1])))

linest = sc.textFile("/home/ankita/MLProject/SVM/GroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
indexedTweetst = partst.zipWithIndex().map(lambda ab: (ab[0], ab[1] + trainingCount))
ft = indexedTweetst.map(lambda p: Row(tindex=int(p[1]), tweet=p[0][1],
                                      label=int(float(p[0][0])), training=0))

alldata = f.union(ft)
schemaTweets = sqlContext.createDataFrame(alldata)
schemaTweets.registerTempTable("data")

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(schemaTweets)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#rescaledData.collect()

wordsvectors = rescaledData.filter(rescaledData.training == 1)["label", "features"] \
                           .map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
    u'more', u'most', u'must', u'my', u'myself', u'no', u'nor', u'not', u'now',
    u'o', u'of', u'off', u'on', u'once', u'only', u'or', u'other', u'our',
    u'ours', u'ourselves', u'out', u'over', u'own', u'r', u're', u's', 'said',
    u'same', u'she', u'should', u'shouldnt', u'so', u'some', u'such', u't',
    u'than', u'that', 'thats', u'the', u'their', u'theirs', u'them',
    u'themselves', u'then', u'there', u'these', u'they', u'this', u'those',
    u'through', u'to', u'too', u'under', u'until', u'up', u'very', u'was',
    u'wasnt', u'we', u'were', u'werent', u'what', u'when', u'where', u'which',
    u'while', u'who', u'whom', u'why', u'will', u'with', u'wont', u'would',
    u'y', u'you', u'your', u'yours', u'yourself', u'yourselves'
]

stopwordsRemover = StopWordsRemover(inputCol="words1", outputCol="filtered").setStopWords(add_stopwords)

tokenizer = Tokenizer(inputCol="Text", outputCol="tokens")
hashtf = HashingTF(numFeatures=2**16, inputCol="filtered", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms

# bag of words count
#hashtf = HashingTF(numFeatures=2**16, inputCol="tokens", outputCol='tf')
#idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, tokenizer, hashtf, idf])
df_review = df_review.filter("cool >= 3 or useful >= 3 or funny >= 3")
df_review = df_review.select("stars", "text")
df_review = df_review.repartition(100)

from pyspark.sql import functions as F
df_review = df_review.withColumn("target", F.when(df_review.stars <= 2, 1).otherwise(0))
df_review.cache()

(train_set, test_set) = df_review.randomSplit([0.7, 0.3], seed=1002)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
#lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(test_set)

lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
#predictions = predictions.select('target', 'label', 'rawPrediction', 'probability', 'prediction')
except KafkaError as ke:
    logger.debug('Fail to start kafka producer, caused by %s' % ke.message)

try:
    # Create dstream from kafka topic
    directKafkaStream = KafkaUtils.createDirectStream(ssc, kafka_topic,
                                                      {'metadata.broker.list': broker_ip})
    logger.debug('Create direct dstream from kafka successfully')
except:
    logger.debug('Unable to create dstream from kafka')

atexit.register(shutdown_hook, kafka_producer, spark)

# Load in the tokenizer, hashing_tf, idf_model, nb_model and the tag/catId maps
try:
    logger.debug('Loading models')
    tokenizer = Tokenizer.load(tokenizer_file)
    hashing_tf = HashingTF.load(hashing_tf_file)
    idf_model = IDFModel.load(idf_model_file)
    nb_model = NaiveBayesModel.load(nb_model_file)
    selected_tags = pd.read_csv(selected_tags_file, header=None)
    local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
    local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
    catId_to_tags = sc.broadcast(local_catId_to_tags)
    tags_to_catId = sc.broadcast(local_tags_to_catId)
    tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
    catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
    logger.debug('loaded models successfully')
except:
    logger.debug('Fail to load models')
def transform(spark, s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}, {}, {}'.format(s3_input_data, s3_output_train_data,
                                               s3_output_validation_data, s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data, sep='\t', schema=schema, header=True, quote=None)
    df_csv.show()

    # This dataset should already be clean, but it's always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass over the data, applying IDF needs two passes:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  # , minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df) \
                                                       .select('star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df
                            .withColumn('f', to_array(col('scaled_pca_features')))
                            .select(['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit([0.9, 0.05, 0.05])

    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_test_data))
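# Hypothetical invocation of transform() (the paths below are placeholders, not from the source):
# transform(spark, 's3://my-bucket/raw.tsv', 's3://my-bucket/train',
#           's3://my-bucket/validation', 's3://my-bucket/test')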
def tokenize(inputDF):
    tokenizer = Tokenizer(inputCol='sentences', outputCol='tokenizedwords')
    tokenized = tokenizer.transform(inputDF)
    return tokenized
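# Illustrative usage of tokenize (not from the original source); assumes a
# SparkSession named `spark`.
demo_df = spark.createDataFrame([("the quick brown fox",)], ["sentences"])
tokenize(demo_df).show(truncate=False)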
train_datafile = get_args().input
train_df = spark.read.csv(train_datafile, header=True, sep='\t').limit(80000)  # limit the records to a smaller set while debugging

train_sents1 = train_df.select('genre', 'sentence1')
train_sents2 = train_df.select('genre', 'sentence2')
# train_sents1.show(5)

udf_lower = F.udf(lower_folding, StringType())
train_sents1_lower = train_sents1.withColumn('lower_sents', udf_lower('sentence1'))
# train_sents1_lower.show(5)

udf_rv_punc = F.udf(remove_punctuation_re, StringType())
train_sents1_rv_punc = train_sents1_lower.withColumn('rv_punc_sents', udf_rv_punc('lower_sents'))

tokenizer = Tokenizer(inputCol="rv_punc_sents", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="filtered_tokens", outputCol="avg_word_embed")

doc2vec_pipeline = Pipeline(stages=[tokenizer, remover, w2v])
doc2vec_model = doc2vec_pipeline.fit(train_sents1_rv_punc)
doc2vecs_df = doc2vec_model.transform(train_sents1_rv_punc)

w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

genre2label = StringIndexer(inputCol="genre", outputCol="label")
rf_classifier = MultilayerPerceptronClassifier(labelCol="label", featuresCol="avg_word_embed")
lines = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataSet.txt")
parts = lines.map(lambda l: l.split(","))
f = parts.map(lambda p: Row(tindex=int(p[0]), packageName=p[1], packagePermissions=p[2],
                            label=int(float(p[3])), training=1))

linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]), packageName=p[1], packagePermissions=p[2],
                              label=int(float(p[3])), training=0))

alldata = f.union(ft)
schemaApp = sqlContext.createDataFrame(alldata)
schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)

hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

wordsvectors = rescaledData["label", "features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
# Count the number of words in each text
from pyspark.sql.functions import length
data = data.withColumn('length', length(data['text']))
data.show()

# In[3]:

# Compare the length difference between ham and spam
data.groupby('class').mean().show()

# In[4]:

# Create TF-IDF features for each text
# TF: Term Frequency
# IDF: Inverse Document Frequency
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler

tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
final_feature = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, final_feature])

clean_data = data_prep_pipe.fit(data).transform(data)
clean_data.show()
clean_data.take(1)
clean_data.take(1)[0][-1]
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TfIdfExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    for features_label in rescaledData.select("features", "label").take(3):
        print(features_label)
    # $example off$

    spark.stop()
def create_w2v_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.memory.offHeap.enabled", True) \
        .config("spark.memory.offHeap.size", "2g") \
        .getOrCreate()

    input_file = spark.sparkContext.wholeTextFiles(PATH)

    print("""
    Preparing the data (1)...
    """)
    prepared_data = input_file.map(lambda x: (x[0], remove_punctuation(x[1])))

    print("""
    Preparing the data (2)...
    """)
    df = prepared_data.toDF()

    print("""
    Preparing the data (3)...
    """)
    prepared_df = df.selectExpr('_2 as text')

    print("""
    Tokenizing...
    """)
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    print("""
    Removing stop words...
    """)
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)
    filtered = remover.transform(words)

    print("""
    Building the model...
    """)
    word2Vec = Word2Vec(vectorSize=50, inputCol='filtered', outputCol='result', minCount=2)
    model = word2Vec.fit(filtered)

    print("""
    Saving the model...
    """)
    today = datetime.datetime.today()
    model_name = today.strftime("model/kurs_model")
    print("""
    Model """ + model_name + """ saved
    """)
    model.save(model_name)

    spark.stop()
# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
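# A plausible next cell (an assumption, not the author's verbatim code): fit LDA
# on the count-vectorized `prepped` frame and inspect the discovered topics.
lda = LDA().setK(10).setMaxIter(5)
ldaModel = lda.fit(prepped)
ldaModel.describeTopics(3).show()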
# Assumes `word_tokenize` (NLTK), `ps` (a stemmer), `stop_words`, `review`,
# `calification`, `final`, and `filtered_sentence` are defined earlier in the script.
for sentence in review:
    word_tokens = word_tokenize(sentence)
    for w in word_tokens:
        if w not in stop_words:
            w = ps.stem(w)
            final = final + " " + w
    filtered_sentence.append(final)
    final = ""

review = filtered_sentence
# print("\n \n -----: \n " + str(review))

# Run the bag-of-words algorithm
dup_vector = list(zip(calification, review))
sentenceData = spark.createDataFrame(dup_vector, ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# numFeatures should be at least the number of distinct words in the documents;
# it would be a good idea to do a word count here to choose it
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
# rescaledData.select("label", "features").show(20, False)  # to show the dataframe structure

# print(len(review))        # printing the size of both arrays to check index alignment
# print(len(calification))
# print(review)             # just to test what the array contains
import argparse

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import Tokenizer


def filter_comments(df):
    return df.filter(df['author'] != '[deleted]') \
             .filter(df['body'] != '[deleted]') \
             .filter(df['body'] != '[removed]')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Reddit Comment Prediction')
    parser.add_argument('-i', '--input_file', type=str,
                        help="""The JSON input data file that contains the raw comment data""")
    args = parser.parse_args()

    sc = SparkContext("local", "Prediction")
    sqlContext = SQLContext(sc)

    df = sqlContext.read.json(args.input_file)
    print('Loaded input file {} with {} total comments'.format(args.input_file, df.count()))

    filtered = filter_comments(df)
    print('{} comments after filtering'.format(filtered.count()))

    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(filtered)
    wordsDataFrame.select("body", "words").show()
def clustering(self, columns='*', num_cluster=2, n_g=2):
    """
    input:
    @ columns: the column to cluster (special characters removed)
    @ num_cluster: number of clusters
    @ n_g: size of the n-grams
    return:
    @ a data frame with a clustering column
    """
    n_gr = n_g
    data_frame_1 = self._df

    # check the data type; if the column is not a string, cast it to string
    valid_cols = [col for (col, typ) in filter(lambda typ: typ[1] == 'string', self._df.dtypes)]
    if columns not in valid_cols:
        data_frame_1 = data_frame_1.withColumn(columns + '_', data_frame_1[columns].cast("string"))
        data_frame_1 = data_frame_1.drop(columns)
        data_frame_1 = data_frame_1.withColumnRenamed(columns + '_', columns)

    # make the string tokenizable by separating its characters with blank spaces
    udf_space = udf(lambda z: " ".join(z))
    data_frame_1 = data_frame_1.withColumn(columns + '_split', udf_space(columns)).orderBy(columns)

    # tokenize the words and build the n-grams
    tokenizer = Tokenizer(inputCol=columns + '_split', outputCol=columns + "_token")
    data_frame_2 = tokenizer.transform(data_frame_1)

    ngram = NGram(n=n_gr, inputCol=columns + "_token", outputCol=columns + "_ngram")
    ngramDataFrame = ngram.transform(data_frame_2)

    # vectorization: map text to vectors
    cv = CountVectorizer(inputCol=columns + "_ngram", outputCol="features", vocabSize=10, minDF=1.0)
    model = cv.fit(ngramDataFrame)
    result = model.transform(ngramDataFrame)

    # set up and fit k-means
    kmeans = KMeans().setK(num_cluster).setSeed(1)
    model_kmean = kmeans.fit(result)
    predictions_kmean = model_kmean.transform(result)
    df = predictions_kmean.orderBy('prediction', ascending=True).select(self._df.schema.names + ['prediction'])

    # reshape the table so it is easier to read
    print('show the count of each value per cluster')
    temp = df.groupBy(columns, 'prediction').count()
    temp = temp.withColumnRenamed('prediction', 'cluster')
    df = df.withColumnRenamed('prediction', 'cluster')
    temp = temp.withColumnRenamed('count', 'count in cluster')
    temp.show()

    # show the mode (most frequent value) of each cluster
    window = Window.partitionBy("cluster").orderBy(col("count in cluster").desc())
    test = (temp.withColumn('row_num', F.row_number().over(window))
                .where(F.col('row_num') == 1)
                .select(columns, 'cluster'))
    print('Default replacement: replace each instance with the mode of its cluster')
    test.orderBy('cluster', ascending=True).show()

    # collect the mode of each cluster into a list
    test_list = test.select(columns).orderBy('cluster').collect()
    name_list = [i[columns] for i in test_list]

    # let the user define the replacement words
    list_setting = input("Type 'yes' to enter customized replacement words, or press any key for the default replacement setting: \n")
    if list_setting == 'yes':
        count = 0
        while count < num_cluster:
            usr_replace = input('Enter what cluster {0} should be, or press enter to skip: \n'.format(count))
            if usr_replace != '':
                name_list[count] = usr_replace
            count += 1

    # replace the words
    udf_place_name = udf(lambda z: name_list[z])
    data_frame_replace = df.withColumn('replace_' + columns, udf_place_name('cluster'))

    replace_input = input('Type yes to replace the origin column, or press any key to keep it:\n')
    if replace_input == 'yes':
        data_frame_replace = data_frame_replace.drop(columns)
        data_frame_replace = data_frame_replace.withColumnRenamed("replace_" + columns, columns)

    data_frame_replace = data_frame_replace.drop('cluster')
    # replace the origin dataframe and show the result to the user
    self._df = data_frame_replace
    self._df.show()
    return self
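A hypothetical usage sketch for the method above; the wrapper class name and the 'city' column are assumptions, not from the source:

op = DataFrameTransformer(df)  # hypothetical wrapper exposing self._df
op.clustering(columns='city', num_cluster=3, n_g=2)  # interactive prompts drive the replacement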
def cleanLower(doc):
    return doc.replace("<br /><br />", " ").lower()

rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#       .baseOn([evaluator.metricName,'precision'])
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

conf = SparkConf().setAppName("MLPipeline")
sc = SparkContext(conf=conf)

# Read training data as a DataFrame
sqlCt = SQLContext(sc)
trainDF = sqlCt.read.parquet("20news_train.parquet")

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

# Build the cross-validation model
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
             .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
             .build())
crossval = CrossValidator(estimator=pipeline,
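                          # --- assumed completion of the truncated call above;
                          # --- the evaluator choice and fold count are not in the source
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3)
cvModel = crossval.fit(trainDF)  # retrains over the grid and keeps the best model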
df2 = df.withColumn('date', F.unix_timestamp('date', form).cast('timestamp'))
df2.show(5)  # show() prints and returns None, so wrapping it in print() is redundant
df = df2

import matplotlib.pyplot as plt

dates = df.select(F.date_format('date', 'yyyy-MM-dd').alias('no_timestamp')) \
          .groupby('no_timestamp').count().sort(F.col('no_timestamp'))
dates.show(dates.count())
dates.toPandas().plot(kind='line', x='no_timestamp', y='count')
dates.toPandas().plot(kind='bar', x='no_timestamp')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
prep_df = tokenizer.transform(df)

cv_prep = CountVectorizer(inputCol="words", outputCol="prep")
cv_model = cv_prep.fit(prep_df)
ready_df = cv_model.transform(prep_df)

# stopWords = [word for word in cv_prep.vocabulary if any(char.isdigit() for char in word)]
# remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)
# prep_df = remover.transform(prep_df)

# map each row to (id, mllib vector); the original two-argument lambda would fail,
# since map passes a single Row per element
trainable = ready_df.select('tweet_id', 'prep').rdd.map(lambda row: [row[0], Vectors.fromML(row[1])]).cache()

print("Trainable")
print(trainable.take(10))
print("take")

model = LDA.train(trainable, k=5, seed=1, optimizer="online")

exit(0)
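A sketch of inspecting the trained topics, mapping term indices back to words through the CountVectorizer vocabulary fitted above:

topics = model.describeTopics(maxTermsPerTopic=10)
for topic_id, (term_indices, weights) in enumerate(topics):
    terms = [cv_model.vocabulary[i] for i in term_indices]
    print(topic_id, list(zip(terms, weights)))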
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sparkSession = SparkSession\
        .builder\
        .getOrCreate()

    # Prepare training documents from a list of (id, text, label) tuples.
    training = sparkSession.createDataFrame([(0, "a b c d e spark", 1.0),
                                             (1, "b d", 0.0),
                                             (2, "spark f g h", 1.0),
                                             (3, "hadoop mapreduce", 0.0)],
                                            ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and logistic regression.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tk = tokenizer.transform(training)
    tk.printSchema()
    tk.show()

    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    hs = hashingTF.transform(tk)
    hs.printSchema()
    hs.show()

    logistic_regression = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, logistic_regression])
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.types import ArrayType, StringType

lemmtizer = WordNetLemmatizer()  # assumed: the source fragment uses lemmtizer without showing its definition

def lemmetize(input_list):
    if not input_list:
        return list()
    return [lemmtizer.lemmatize(word) for word in input_list]

spark = SparkSession.builder.appName("TfIdf-Lemmetization").getOrCreate()

lemmetize = F.udf(lemmetize)
# spark.udf.register("lemmetize", lemmetize)

documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id", F.row_number().over(Window.orderBy('value')))
documents.printSchema()

# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)
wordsData.show()

# note: despite the column name "lemms", a Snowball stemmer is applied here, not the lemmatizer above
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
wordsData = wordsData.withColumn("lemms", stemmer_udf("words"))

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="lemms", outputCol="result")
model = word2Vec.fit(wordsData)
result = model.transform(wordsData)
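A short usage sketch for the fitted Word2Vec model above (the query word is an assumption):

result.select("lemms", "result").show(truncate=False)
model.findSynonyms("data", 5).show()  # rows of (word, cosine similarity) nearest the assumed query word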
dataSet = dataSet.withColumn('cleanReview', cleanText(F.col('reviews'))).filter(F.col('cleanReview') != '')
dataSet.show()

# %%
dataSet = dataSet.withColumn('class', dataSet['class'].cast(IntegerType()))
dataSet = dataSet.select('class', 'cleanReview').withColumnRenamed('cleanReview', 'reviews')

# %%
trainDF, testDF = dataSet.randomSplit([0.8, 0.2])
trainDF.show()
testDF.show()

# %%
tokenizer = Tokenizer(inputCol="reviews", outputCol="tokens")
countVector = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='features')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, countVector, idf])
pipelineModel = pipeline.fit(trainDF)

# %%
pTrainDF = pipelineModel.transform(trainDF)
pTestDF = pipelineModel.transform(testDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1")
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
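The lr estimator above is defined but never fitted in this fragment; a minimal sketch of the remaining train-and-score steps under that assumption:

# %%
lrModel = lr.fit(pTrainDF)
predictions = lrModel.transform(pTestDF)
print(evaluator.evaluate(predictions))  # F1 on the held-out split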
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *

from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    fields = [StructField("subreddit", StringType(), True),
              StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))

    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)

    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")])

    # group by subreddit and term, then count occurrences of each term per subreddit
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db = Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']) \
                   .filter('count > {}'.format(THRESHOLD)) \
                   .foreachPartition(db.saveSubredditWords)
labeledRdd = sc.parallelize(labeledData)

from pyspark.sql import SQLContext

def preProcess(doc):
    clean = doc.replace("<br /><br />", " ")
    return clean.lower()

rdd = labeledRdd.map(lambda doc: (preProcess(doc[0]), doc[1]))

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)

import itertools
# build the vocabulary from the tokenized words; collecting the raw review strings,
# as the original did, would yield a set of characters rather than words
lists = dfTrainTok.rdd.map(lambda r: r.words).collect()
dictWords = set(itertools.chain(*lists))
dictionaryWords = {}
for i, word in enumerate(dictWords):
    dictionaryWords[word] = i
dict_broad = sc.broadcast(dictionaryWords)

from pyspark.mllib.linalg import SparseVector

def vectorize(row, dico):
    vector_dict = {}
    for w in row.words:
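        # --- assumed completion of the truncated helper above; the simple
        # --- term-count scheme is a guess, not recovered from the original source
        if w in dico:
            vector_dict[dico[w]] = vector_dict.get(dico[w], 0) + 1.0
    return (row.label, SparseVector(len(dico), vector_dict))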
data = df6.select(
    'id',
    (lower(regexp_replace('comment_text', "[^a-zA-Z\\s]", "")).alias('text')),
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
data = data.select(
    'id',
    (regexp_replace('text', "[\r\n]+", "").alias('text')),
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

# na.drop()/na.fill() return new DataFrames; the results must be assigned to take effect
data = data.na.drop()
data = data.na.fill(0)

clean = data.where(col('toxic').isNotNull()).where(
    col('severe_toxic').isNotNull()).where(col('obscene').isNotNull()).where(
    col('threat').isNotNull()).where(col('insult').isNotNull()).where(
    col('identity_hate').isNotNull())

# Token Parser
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordToken = tokenizer.transform(clean)

# Delete Stop Words
remover = StopWordsRemover(inputCol='words', outputCol='words_clean')
dataFrameNoStop = remover.transform(wordToken)

# Term Frequency
hashTermFreq = HashingTF(inputCol="words_clean", outputCol="rawFeatures")
termFreq = hashTermFreq.transform(dataFrameNoStop)

# Term Frequency-Inverse Document Frequency
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(termFreq)
tfidf = idfModel.transform(termFreq).select('features', 'toxic', 'severe_toxic', 'obscene',
def review_to_words(raw_review):
    # 1. Remove HTML markup
    review_text = BeautifulSoup(raw_review).text
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    # 5. Join the words back into one string separated by spaces, and return the result.
    return " ".join(meaningful_words)

stops = set(stopwords.words("english"))

lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# skip the header row; the tuple-unpacking lambda in the original is Python 2 only
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))

schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
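selectData above is prepared but not used in this fragment; a sketch of one plausible next step (the classifier choice and split are assumptions):

from pyspark.ml.classification import LogisticRegression

train, test = selectData.randomSplit([0.8, 0.2])
lrModel = LogisticRegression(maxIter=10, regParam=0.01).fit(train)
lrModel.transform(test).select("label", "prediction").show(5)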
print('\n', sms.dtypes, '\n')
sms.printSchema()

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Text to tokens
wrangled = Tokenizer(inputCol="text", outputCol="words").transform(wrangled)

# Remove stop words
wrangled = StopWordsRemover(inputCol="words", outputCol="terms").transform(wrangled)

# Apply the hashing trick
wrangled = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024).transform(wrangled)

# Convert hashed symbols to TF-IDF
sms = IDF(inputCol="hash", outputCol="features").fit(wrangled).transform(wrangled)

# View the first four records
sms.show(4, truncate=False)
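A sketch of the spam classifier that typically follows this wrangling; the split ratio, seed, regParam, and the presence of a numeric label column are assumptions:

from pyspark.ml.classification import LogisticRegression

sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)
logistic = LogisticRegression(regParam=0.2).fit(sms_train)  # assumes a 'label' column
prediction = logistic.transform(sms_test)
prediction.groupBy('label', 'prediction').count().show()    # confusion-matrix style summary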
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="TokenizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    sentenceDataFrame = sqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsDataFrame = tokenizer.transform(sentenceDataFrame)
    for words_label in wordsDataFrame.select("words", "label").take(3):
        print(words_label)
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)
    # $example off$

    sc.stop()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})

# Turning Text into Tables

# term-document text
# A selection of children's books
books.show(truncate=False)

# Removing punctuation
from pyspark.sql.functions import regexp_replace
# Regular expression (REGEX) to match commas and hyphens
REGEX = '[,\\-]'
books = books.withColumn('text', regexp_replace(books.text, REGEX, ' '))

# Text to tokens
from pyspark.ml.feature import Tokenizer
books = Tokenizer(inputCol='text', outputCol='tokens').transform(books)

# Remove stop words
from pyspark.ml.feature import StopWordsRemover
stopwords = StopWordsRemover()

# Take a look at the list of stop words
stopwords.getStopWords()

# Specify the input and output column names
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
books = stopwords.transform(books)

# Feature hashing
from pyspark.ml.feature import HashingTF
hasher = HashingTF(inputCol='words', outputCol='hash', numFeatures=32)
books = hasher.transform(books)
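The hashed features above usually feed an IDF stage next; a minimal sketch continuing with the same column names:

from pyspark.ml.feature import IDF

books = IDF(inputCol='hash', outputCol='features').fit(books).transform(books)
books.select('text', 'features').show(4, truncate=False)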
print("Create dataframe")
t0 = time()
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
print("Showing first example:")
print()
print(df.first())
tt = time() - t0
print()
print("Dataframe created in {} seconds".format(round(tt, 3)))

# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)

# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)

# In[317]:

print("Start tokenizing, computing bigrams and splitting between test and train")
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8, 0.2])
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tokenizer_sample") \
    .master("local[*]") \
    .getOrCreate()

data = [(0, "Tokenization is the process"), (1, "Refer to the Tokenizer")]
inputDF = spark.createDataFrame(data).toDF("id", "input")

tokenizer = Tokenizer(inputCol="input", outputCol="output")
outputDF = tokenizer.transform(inputDF)
outputDF.printSchema()
outputDF.show()

spark.stop()  # without parentheses the method is merely referenced, never called
for category_dir in listdir(input_dir):
    # Build the dataset of (docname, category, wordcounts) tuples
    distinct_labels[curr_cat] = category_dir
    next_docs = sc.wholeTextFiles(('/').join([input_dir, category_dir]))
    # bind the current category via a default argument: RDDs are lazy, so a plain
    # closure over curr_cat would see its final value; the original tuple-unpacking
    # lambda is also Python 2 only
    docs = docs.union(next_docs.map(lambda doc_lines, cat=float(curr_cat): (format_text(doc_lines[1]), cat)))
    curr_cat += 1

training_rows = docs.sample(False, train_fraction)
testing_rows = docs.subtract(training_rows)

# Prepare training and test documents, which are labeled.
LabeledDocument = Row("text", "label")
train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, idf, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=1000, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(train)

print('\n\n --------------- RESULT ----------------------\n\n')
print(model.transform(test).head())
print('\n\n ---------------------------------------------\n\n')
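A sketch of scoring the fitted pipeline above beyond the single head() row; the evaluator settings are assumptions:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: {}".format(evaluator.evaluate(predictions)))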
from __future__ import print_function

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession

if __name__ == '__main__':
    spark = SparkSession\
        .builder\
        .appName('Tokenizer')\
        .getOrCreate()

    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["id", "sentence"])

    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    regexTokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern="\\W")

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select('sentence', 'words')\
        .withColumn('tokens', countTokens(col('words'))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select('sentence', 'words') \
        .withColumn('tokens', countTokens(col('words'))).show(truncate=False)

    spark.stop()  # parentheses added; the bare attribute reference was a no-op