def main(args):
    textFiles = sc.wholeTextFiles(maindir + '4').map(readContents)
    # print("READ second {} check".format(textFiles.take(10)))

    # Filter the rows based on all the indexes available in the training file, else drop them.
    # http://stackoverflow.com/questions/24718697/pyspark-drop-rows
    htmldf = sqlContext.createDataFrame(textFiles)
    htmldf.cache()
    traindf = getCleanedRDD(maindir + 'train_v2.csv',
                            ["id", "images", "links", "text", "label"], htmldf)
    traindf.write.save(maindir + "output/train_4.parquet", format="parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.01)

    # Alternative classifiers (not used in the pipeline below); note that the
    # RandomForestClassifier must use labelCol="label", not "features".
    # https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html
    # http://spark.apache.org/docs/latest/api/python/pyspark.ml.html
    # w2v = Word2Vec(inputCol="text", outputCol="w2v")
    gbt = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label")
    rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to the training documents.
    model = pipeline.fit(traindf)
    print('-----------------------------------------------------------------------------')

    testdf = getCleanedRDD(maindir + 'test.csv',
                           ["id", "images", "links", "text", "label"], htmldf)
    # print(testdf.count())

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(testdf)
    # pand = prediction.toPandas()
    # pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8')
    # prediction.select('id', 'probability', 'prediction') \
    #     .write.format('com.databricks.spark.csv') \
    #     .option("header", "true").save(maindir + 'output/result_lr0.csv')
    # prediction schema: DataFrame[id: string, images: bigint, links: bigint, text: string,
    #     label: double, words: array<string>, features: vector, rawPrediction: vector,
    #     probability: vector, prediction: double]
    # selected = prediction.select("id", "probability", "prediction")
    # for row in selected.collect():
    #     print(row)
    sc.stop()
# spark-submit --master local[*] --packages com.databricks:spark-csv_2.10:1.2.0 cluster.py
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, Normalizer
from pyspark.ml.clustering import KMeans

sc = SparkContext()
sqlContext = SQLContext(sc)

text = sc.textFile('file:/Users/wangmengyuan/Desktop/rr/listings.txt') \
    .map(lambda l: l.split('\t')) \
    .map(lambda l: (l[0], l[1]))
df = sqlContext.createDataFrame(text, ["houseid", "description"])

tokenizer = Tokenizer(inputCol="description", outputCol="tokens")
tokenized = tokenizer.transform(df).cache()

remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
stopWordsRemoved_df = remover.transform(tokenized).cache()

hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=200)
tfVectors = hashingTF.transform(stopWordsRemoved_df).cache()

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors).cache()

# L2-normalize the TF-IDF vectors before clustering.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures")
l2NormData = normalizer.transform(tfIdfVectors)

# Cluster on the normalized vectors (KMeans defaults to featuresCol="features",
# which would ignore the normalization step above).
kmeans = KMeans(featuresCol="normFeatures").setK(10).setMaxIter(20)
km_model = kmeans.fit(l2NormData)
clustersTable = km_model.transform(l2NormData)

# Save to HDFS
df1 = clustersTable[['houseid', 'prediction']]
# df1.select('houseid', 'prediction').write.format('com.databricks.spark.csv').save('cluster.csv')
df1.select('houseid', 'prediction').show(20)
from pyspark.ml.feature import CountVectorizer

count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
cv_df = count_vec.fit(refined_df).transform(refined_df)
cv_df.select(['user_id', "business_id", "review_id", 'refined_tokens', 'features']).show(1, True, True)
count_vec.fit(refined_df).vocabulary

from pyspark.ml.feature import HashingTF, IDF

hashing_vec = HashingTF(inputCol='refined_tokens', outputCol='tf_features')
hashing_df = hashing_vec.transform(refined_df)
hashing_df.select(['user_id', 'refined_tokens', 'tf_features']).show(4, True, True)

tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)
tf_idf_df.select('tf_idf_features').show(1, True, True)
tf_idf_df.show(1, True, True)


def get_dummy(df, indexCol, categoricalCols, continuousCols, labelCol, dropLast=False):
    from pyspark.ml import Pipeline
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create GBM model
gbm = H2OGBM(ratio=0.8, featuresCols=[idf.getOutputCol()], predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
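# The ColumnPruner call above is cut off mid-list. A minimal sketch of how it could be
# closed and the stages chained into a pipeline; the final pruned column and the stage
# order are assumptions, not taken from the source.
from pyspark.ml import Pipeline

colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol(),  # assumed: prune the tokenizer output as well
])

pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, gbm, colPruner])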
# MAGIC %md ### Define the Pipeline
# MAGIC The pipeline for the model consists of the following stages:
# MAGIC - A Tokenizer to split the tweets into individual words.
# MAGIC - A StopWordsRemover to remove common words such as "a" or "the" that have little predictive value.
# MAGIC - A HashingTF class to generate numeric vectors from the text values.
# MAGIC - A LogisticRegression algorithm to train a binary classification model.

# COMMAND ----------

# Convert each sentence into a list of words
tokenizer = Tokenizer(inputCol="text", outputCol="SentimentWords")
# Remove stop words
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="MeaningfulWords")
# Convert words to numeric features via term-frequency hashing
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
# Set up the model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.01)
# Chain the four transforms into a single pipeline
pipeline = Pipeline(stages=[tokenizer, swr, hashTF, lr])

# COMMAND ----------

# MAGIC %md ### Run the Pipeline as an Estimator
# MAGIC The pipeline itself is an estimator, so it has a **fit** method that can be called to run the pipeline on a specified DataFrame. In this case, we run the pipeline on the training data to train a model.

# COMMAND ----------
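# The training cell itself is not shown in the excerpt. A minimal sketch of the fit step
# described above; the training DataFrame name `train` is an assumption, not from the source.
model = pipeline.fit(train)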
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, NGram, HashingTF, IDF

spark = SparkSession.builder.appName("TfIdf-Ngram").getOrCreate()

documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id", F.row_number().over(Window.orderBy('value')))
documents.printSchema()

# Create tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)

# Build 2-grams from the tokens
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordsData)

# Apply TF on the n-gram data
# (alternatively, CountVectorizer can also be used to get term frequency vectors)
hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(ngramDataFrame)

# Calculate the IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Display the results
rescaledData.select("doc_id", "features").show(truncate=False)

# Close the Spark session
spark.stop()
def getvalue3():
    if request.method == 'POST':
        subreddit_input = request.form['subreddit']
        # subreddit_input = 'World Politics'
        subreddit_filter = requests.get(
            url + 'reddit_post.json?orderBy="subreddit"&equalTo="' + str(subreddit_input) + '"')
        subreddits = json.loads(subreddit_filter.text)

        results = []
        for x in subreddits:
            try:
                results.append(subreddits[x])
            except KeyError:
                continue

        data = pd.DataFrame.from_dict(results, orient='columns')
        data1 = spark.createDataFrame(pd.DataFrame(data["title"]))
        data1.show(truncate=False)

        # Clean the raw titles with a UDF
        clean_data_udf = udf(clean_data, StringType())
        data1 = data1.withColumn("new_title", clean_data_udf("title"))
        data1.show()

        # Tokenize, remove stop words, and build TF-IDF features
        tokenizer = Tokenizer(inputCol="new_title", outputCol="words")
        data1 = tokenizer.transform(data1)
        data1.show()

        remover = StopWordsRemover(inputCol="words", outputCol="rm_words")
        data1 = remover.transform(data1)
        data1.show()

        hashingTF = HashingTF(inputCol="rm_words", outputCol="rawFeatures", numFeatures=2000)
        data1 = hashingTF.transform(data1)
        data1.show()
        data1.select("rm_words").show(truncate=False)
        data1.select("rawFeatures").show(truncate=False)

        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(data1)
        data1 = idfModel.transform(data1)
        data1.select("features").show(truncate=False)

        # Cluster the titles into two topics
        kmeans = KMeans(k=2, featuresCol="features").setSeed(1)
        kmeans_model = kmeans.fit(data1)
        data1 = kmeans_model.transform(data1)
        data1.select("prediction").show(50)

        data["prediction"] = data1.select("prediction").toPandas()
        print(data["prediction"].value_counts())
        # topic_generator(subreddit_input)

        topic1 = data[data['prediction'] == 0]['title'].reset_index(drop=True)
        topic2 = data[data['prediction'] == 1]['title'].reset_index(drop=True)

        topic1_1 = topic1[0]
        topic1_2 = topic1[1]
        topic1_3 = topic1[2]
        topic1_4 = topic1[3]
        topic1_5 = topic1[4]
        topic2_1 = topic2[0]
        topic2_2 = topic2[1]
        topic2_3 = topic2[2]
        topic2_4 = topic2[3]
        topic2_5 = topic2[4]

        return render_template('title_topic_update.html',
                               topic1_1=topic1_1, topic1_2=topic1_2, topic1_3=topic1_3,
                               topic1_4=topic1_4, topic1_5=topic1_5,
                               topic2_1=topic2_1, topic2_2=topic2_2, topic2_3=topic2_3,
                               topic2_4=topic2_4, topic2_5=topic2_5)
    else:
        return render_template('title_topic.html')
# Convert the tokens into groups of n words.
# NGram is a feature transformer that converts the input array of strings into an array of
# n-grams. Null values in the input array are ignored. It returns an array of n-grams where
# each n-gram is represented by a space-separated string of words. When the input is empty,
# an empty array is returned. When the input array length is less than n (number of elements
# per n-gram), no n-grams are returned.
ngram_df = NGram(n=2, inputCol="words", outputCol="ngrams").transform(words)
ngram_df.show(truncate=False)
ngram_df.select("ngrams").show(truncate=False)

# TF-IDF is a numerical statistic that is intended to reflect how important a word is to a
# document in a collection or corpus. It is often used as a weighting factor in searches of
# information retrieval, text mining, and user modeling.
df = words.select("words")
df.show(truncate=False)

# HashingTF is TF with hashing enabled, which fixes the size of the feature vector.
df_tf = HashingTF(
    inputCol="words",
    outputCol="hashing_tf",
    numFeatures=15  # the default is 262,144 dimensions
).transform(df)
df_tf.show()
df_tf.select("words").show(truncate=False)
# The first list holds the term indices; the second list holds the term counts.
df_tf.select("hashing_tf").show(truncate=False)

# IDF
df_tf_idf = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(df_tf).transform(df_tf)
df_tf_idf.show()
df_tf_idf.select("words").show(truncate=False)
df_tf_idf.select("hashing_tf").show(truncate=False)

# Hashing TF
def test_save_load_pipeline_estimator(self):
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(5)
    lr2 = LogisticRegression().setMaxIter(10)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100]) \
        .addGrid(ova.classifier, [lr1, lr2]) \
        .build()

    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator())
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel = tvs.fit(training)

    # Test save/load of TrainValidationSplitModel.
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
    for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                          tvsModel.bestModel.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)

    # Test nested pipeline
    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    tvs2 = TrainValidationSplit(
        estimator=nested_pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator())
    tvs2Path = temp_path + "/tvs2"
    tvs2.save(tvs2Path)
    loadedTvs2 = TrainValidationSplit.load(tvs2Path)
    self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel2 = tvs2.fit(training)

    # Test save/load of the nested TrainValidationSplitModel.
    tvsModelPath2 = temp_path + "/tvsModel2"
    tvsModel2.save(tvsModelPath2)
    loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
    self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
    loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
    original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
    self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
    self.assertEqual(len(loaded_nested_pipeline_model.stages),
                     len(original_nested_pipeline_model.stages))
    for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                          original_nested_pipeline_model.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)
# Fill missing comments, then convert the pandas frames to Spark DataFrames.
train_df.fillna('', inplace=True)
train_df = spark.createDataFrame(train_df)

test_df = pd.read_csv('test.csv')
test_df.fillna('', inplace=True)
test_df = spark.createDataFrame(test_df)

# Label columns (everything except the id and the raw text).
out_cols = [i for i in train_df.columns if i not in ['id', 'comment_text']]
# train_df.filter(F.col('toxic') == 1).show(5)

# Tokenizer
tokenizer = Tokenizer(inputCol='comment_text', outputCol='words')
words_data = tokenizer.transform(train_df)

# Term frequency via the hashing trick
hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
tf = hashing_tf.transform(words_data)
tf.select('rawFeatures').take(5)
tf.count(), len(tf.columns)

# Inverse document frequency
idf = IDF(inputCol='rawFeatures', outputCol='features')
idfModel = idf.fit(tf)
tf_idf = idfModel.transform(tf)

# Performing the logistic regression
REG = 0.01
lr = LogisticRegression(featuresCol='features', labelCol='toxic', regParam=REG)
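# The snippet stops right after defining the regression. A minimal sketch of the fit/score
# step, assuming the binary 'toxic' label and the 'id' column from the training CSV; this
# is not taken from the source.
lr_model = lr.fit(tf_idf)
res_train = lr_model.transform(tf_idf)
res_train.select('id', 'toxic', 'probability', 'prediction').show(5)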
def main():
    spark = SQLContext(SparkContext.getOrCreate())

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description', outputCol='words_all', pattern=r'\W')

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(inputCol='words_all',
                                        outputCol='words_clean').setStopWords(stopwords)

    # get word frequencies using a simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean', outputCol='words_count',
                                    vocabSize=1000, minDF=2)

    # get tf-idf word frequencies
    add_wordtf = HashingTF(inputCol='words_clean', outputCol='words_tf', numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf', outputCol='words_tfidf', minDocFreq=2)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])

    # apply data preparation pipeline
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)

    # split into training and testing
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)

    # fit logistic regression models
    logistic_wordcount = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                            featuresCol='words_count', labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')
    logistic_tfidf = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                        featuresCol='words_tfidf', labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  metricName='accuracy')
    for model, name in ((logistic_wordcount, 'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
A quick reminder about these concepts:

- The hashing trick provides a fast and space-efficient way to map a very large (possibly
  infinite) set of items (in this case, all words contained in the SMS messages) onto a
  smaller, finite number of values.
- The TF-IDF matrix reflects how important a word is to each document. It takes into account
  both the frequency of the word within each document and the frequency of the word across
  all of the documents in the collection.

The tokenized SMS data are stored in sms in a column named words. You've cleaned up the
handling of spaces in the data so that the tokenized text is neater.

Instructions

- Import the StopWordsRemover, HashingTF and IDF classes.
- Create a StopWordsRemover object (input column words, output column terms). Apply to sms.
- Create a HashingTF object (input results from previous step, output column hash). Apply to wrangled.
- Create an IDF object (input results from previous step, output column features). Apply to wrangled.

SOLUTION

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)
###############################################################################################
# Pipeline
###############################################################################################

# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stop words from the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="no_stops",
                            stopWords=swords)

# The cheaper way to do TF-IDF:
# HashingTF produces a sparse term-frequency vector, so zero-valued entries are not stored.
# The output looks like (number_of_features, {index: count, ...}); indices with a zero count
# are skipped, so the keys can jump (e.g. 0, 1, 6, 8, ...) depending on the previous step.
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")

# Perform the IDF part of TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features1", minDocFreq=5)

# Append the Tokenizer-StopWordsRemover-HashingTF-IDF output to the Vader output
assembler = VectorAssembler(inputCols=["features1", "vader"], outputCol="features")

# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Create the pipeline
pipeline = Pipeline(
    stages=[tokenizer, stopword, hashingTF, idf, assembler, lr])

###############################################################################################
# Fit model to training set
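# The fitting cell is not included in the excerpt. A minimal sketch of the step announced by
# the header above; the split names `train` and `test` are assumptions, not from the source.
pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)
predictions.select("label", "prediction", "probability").show(5)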
# For natural-language processing it is customary to L2-normalize the feature vectors:
# that is apparently what works best.
from pyspark.ml.feature import Normalizer

normalizerUni = Normalizer(inputCol='words', outputCol='normWords', p=2.0)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print("DataFrame (bi-gram): normalized")
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table because there is only room to display the
# indices of the non-zero elements, not their values.

# Moving on to TF-IDF.
# By choosing the right DataFrame among those above, these computations can of course be
# applied to any column (bigrams, with or without stop words, ...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step has been useful to me before; here it does not seem all that necessary.
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()
data, Y = lf.loadLabeled("./data/train")
labeledData = zip(data, [y.item() for y in Y])
labeledRdd = sc.parallelize(labeledData)


def cleanLower(doc):
    return doc.replace("<br /><br />", " ").lower()


rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
dfTrain = sqlContext.createDataFrame(rdd, ['review', 'label'])

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])
model = pipeline.fit(dfTrain)
print("The model is fitted")
# COMMAND ----------

# MAGIC %md
# MAGIC ### Prepare the Pipeline
# MAGIC For compatibility with Azure Model Management, make sure you are training the model on a cluster with Spark less than 2.3.0, since Model Management runs on Spark 2.1.1 and the Linear Regression model has a new param (epsilon) added in 2.3.0.

# COMMAND ----------

tkn = Tokenizer().setInputCol("abstract").setOutputCol("tokens")
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover().setStopWords(englishStopWords) \
    .setInputCol("tokens").setOutputCol("tokens_no_stop")
tf = HashingTF().setInputCol("tokens_no_stop").setOutputCol("TFOut").setNumFeatures(1000)
idf = IDF().setInputCol("TFOut").setOutputCol("IDFOut").setMinDocFreq(1)
# Note: the assembler uses the raw TF output; pass ["IDFOut"] instead to feed the
# IDF-weighted features to the model.
assem = VectorAssembler().setInputCols(["TFOut"]).setOutputCol("features")
rename = SQLTransformer().setStatement(
    "SELECT features, amt as label FROM __THIS__")
reg = LinearRegression()
pipe = Pipeline().setStages([tkn, stops, tf, idf, assem, rename, reg])

# COMMAND ----------

# MAGIC %md
# MAGIC ### Fit the Pipeline

# COMMAND ----------
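# The fitting cell is not shown in the excerpt. A minimal sketch of the step announced above;
# the training DataFrame name `train` is an assumption, not from the source.
pipeModel = pipe.fit(train)
scored = pipeModel.transform(train)
scored.select("features", "label", "prediction").show(5)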
# ## Learning pipeline

# In[8]:

from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol='summary', outputCol='words')
pipeline = Pipeline(stages=[
    tokenizer,
    StopWordsRemover(inputCol='words', outputCol='filtered_words'),
    HashingTF(inputCol='filtered_words', outputCol='rawFeatures', numFeatures=120000),
    IDF(inputCol='rawFeatures', outputCol='features'),
    LogisticRegression(regParam=.3, elasticNetParam=.01)
])

# ## Testing the model accuracy

# In[9]:

model = pipeline.fit(train_reviews)

# In[10]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator
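# The evaluation cell ends after the import. A minimal sketch of how the evaluator could be
# used; the held-out DataFrame name `test_reviews` and its `label` column are assumptions,
# not from the source.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
predictions = model.transform(test_reviews)
print('Area under ROC:', evaluator.evaluate(predictions))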
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import sys
import string

spark = SparkSession.builder\
    .appName("datasetTraining")\
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataset = spark.read.csv('/bherr006/datasetTraining/training.1600000.processed.noemoticon.csv',
                         header=False, inferSchema=True)
(trainSet, valSet, testSet) = dataset.randomSplit([0.98, 0.01, 0.01], seed=2000)

tokenizer = Tokenizer(inputCol="_c5", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labelStringIndex = StringIndexer(inputCol="_c0", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, labelStringIndex])

pipelineFit = pipeline.fit(trainSet)
trainDf = pipelineFit.transform(trainSet)
valDf = pipelineFit.transform(valSet)

lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(trainDf)
predictions = lrModel.transform(valDf)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predictions))
# Tokenizer to create a "terms" column, so for example from
# content=u'We start learning Spark' we get terms=[u'we', u'start', u'learning', u'spark'].
tokenizer = Tokenizer(inputCol="content", outputCol="terms")
termsData = tokenizer.transform(data)

# Remover to drop stop words that don't contribute, so for example from
# terms=[u'we', u'start', u'learning', u'spark'] we get filtered=[u'start', u'learning', u'spark'].
remover = StopWordsRemover(inputCol="terms", outputCol="filtered")
filteredTermsData = remover.transform(termsData)

# http://spark.apache.org/docs/latest/ml-features.html
# Both HashingTF and CountVectorizer can be used to generate the term frequency vectors.
# HashingTF is a Transformer which takes sets of terms and converts those sets into
# fixed-length feature vectors. In text processing, a "set of terms" might be a bag of words.
# HashingTF utilizes the hashing trick.
# So from filtered=[u'start', u'learning', u'spark'] we get
# rawFeatures=SparseVector(262144, {29470: 1.0, 62173: 1.0, 181346: 1.0}).
tf = HashingTF(inputCol="filtered", outputCol="rawFeatures").transform(filteredTermsData)

# IDF is an Estimator which is fit on a dataset and produces an IDFModel. The IDFModel takes
# feature vectors (generally created from HashingTF or CountVectorizer) and scales each column.
# Intuitively, it down-weights columns which appear frequently in a corpus.
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(tf)

# TF-IDF
tfidf = idf.transform(tf)

labels = data.map(lambda doc: doc["label"])  # Standard Python dict access

# Training and Test datasets
# Here feature #5 contains the data for training, for example:
# [Row(label=0.0, content=u'We start learning Spark',
#      terms=[u'we', u'start', u'learning', u'spark'],
#      filtered=[u'start', u'learning', u'spark'],
#      rawFeatures=SparseVector(262144, {29470: 1.0, 62173: 1.0, 181346: 1.0}),
#      features=SparseVector(262144, {29470: 0.9163, 62173: 0.9163, 181346: 0.9163}))]
# In[14]:

# Keep only rows where the label (n_killed) is at most 2
notes_length_df = notes_length_df.filter(notes_length_df.label <= 2)

# In[15]:

# Create features
tokenizer = Tokenizer(inputCol="notes", outputCol="token_notes")
stopremove = StopWordsRemover(inputCol='token_notes', outputCol='stop_tokens')
# Note: hashingTF reads the raw tokens; use inputCol='stop_tokens' to hash the
# stop-word-filtered tokens instead.
hashingTF = HashingTF(inputCol="token_notes", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

# In[16]:

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'notes_length'], outputCol='features')

# In[17]:

# Create and run a data processing Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])
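# The pipeline above is only created, not run, in this excerpt. A minimal sketch of the
# fit/transform step; not taken from the source.
cleaner = data_prep_pipeline.fit(notes_length_df)
cleaned = cleaner.transform(notes_length_df)
cleaned.select('label', 'features').show(5, truncate=False)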
# Python 3 no longer allows tuple-unpacking lambdas, so index the (doc, lines) pair instead.
docs = docs.union(
    next_docs.map(lambda doc_lines: (format_text(doc_lines[1]), float(curr_cat))))
curr_cat += 1

training_rows = docs.sample(False, train_fraction)
testing_rows = docs.subtract(training_rows)

# Prepare training and test documents, which are labeled.
LabeledDocument = Row("text", "label")
train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, idf, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=1000, regParam=0.001)

# pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
p0 = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
# m0 = p0.fit(train)
# pipeline = Pipeline(stages=[m0, lr])
pipeline = p0

# Fit the pipeline to training documents.
model = pipeline.fit(train)

print('\n\n --------------- RESULT ----------------------\n\n')
print(model.transform(test).head())
print('\n\n ---------------------------------------------\n\n')
# Load data
df0 = spark.read.csv("./jobs_clean.csv", header=True, multiLine=True, inferSchema=True)
df1 = pd.read_csv('./jobs_clean.csv')
# df0.show()
print('The number of jobs:', df0.count())
print('\nthe distinct jobs name: ', df1.job.unique())
print('\nThere are', len(df1.job.unique()) - 1, 'different kinds of jobs in the table.')

# Split the desc field into words
tokenizer = Tokenizer(inputCol='desc_clean', outputCol='desc_words')
df = tokenizer.transform(df0)
# df.show()
# df.select('desc_words').show(10)

# Compute TF-IDF
hashingTF = HashingTF(inputCol='desc_words', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
# print('tfidf for each job:', tfidf.select('desc_words_tfidf').show(10, truncate=False))

# Data normalization
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
# tfidf.select("id", "norm").show(6)

# Compute similarity between jobs and resume
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

print('\nCompute the similarity between jobs and resume...')
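# The similarity computation itself is not included in the excerpt. A minimal sketch under
# the assumption that the resume has been run through the same Tokenizer/HashingTF/IDF/
# Normalizer stages to produce a vector named `resume_norm`, and that the jobs table has a
# `job` column; both names are assumptions, not from the source. Since the vectors are
# L2-normalized, cosine similarity reduces to a dot product.
dot_udf = psf.udf(lambda v: float(v.dot(resume_norm)), DoubleType())
ranked = tfidf.withColumn("similarity", dot_udf("norm")) \
              .orderBy(psf.desc("similarity"))
ranked.select("job", "similarity").show(10)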
def news_classifier():
    data = spark.read.option("mode", "DROPMALFORMED").load("/news_data.csv", format="csv",
                                                           header="true", inferSchema='true')
    data.first()
    data.printSchema()

    # There is a field in the data called constituent_id, which is the company the news
    # headline is about. Drop that column from the data.
    drop_list = ['constituent_id']
    data = data.select([column for column in data.columns if column not in drop_list])
    data.show(5)
    data.printSchema()

    # Regular-expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="news_title", outputCol="words", pattern="\\W")
    # Remove stop words
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
    # Compute bigrams
    ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
    # Add HashingTF and IDF to the transformation
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=10000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms
    # String indexer for the target
    label_stringIdx = StringIndexer(inputCol="weekly_returns", outputCol="label")

    # Create the processing pipeline
    pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, ngram, hashingTF, idf,
                                label_stringIdx])

    # Fit the pipeline to the data.
    pipelineFit = pipeline.fit(data)
    dataset = pipelineFit.transform(data)
    dataset.show(5)

    # Randomly split data into training and test sets; set the seed for reproducibility.
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    # Build a Logistic Regression model
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0, family="multinomial")
    # Train the model with the training data
    lrModel = lr.fit(trainingData)

    predictions = lrModel.transform(testData)
    predictions.filter(predictions['prediction'] == 0) \
        .select("news_title", "weekly_returns", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)

    # Multiclass evaluator
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    print(evaluator.evaluate(predictions))

    # Save predictions to CSV
    predictions = predictions.select("news_title", "weekly_returns", "prediction")
    predictions.write.format("csv").save("/Desktop/predictions-spark.csv")

    # Save the machine learning model
    model_path = "/Desktop/Spark_Model"
    lrModel.save(model_path)

    # Load the model again to make sure it works
    ml_model = lrModel.load(model_path)
    predictions2 = ml_model.transform(testData)

    # Make predictions with the loaded model
    predictions2.filter(predictions2['prediction'] == 0) \
        .select("news_title", "weekly_returns", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)

    # End the Spark session
    spark.stop()
spark = SparkSession.builder \
    .master("local") \
    .appName("Compare Multiclass Models") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

numfeatures = 2000
numclasses = 19

# Load news category data
raw_data = sc.textFile("data/news_sections_abstract2016.txt")
lines = raw_data.map(lambda line: line.split(" ")) \
    .map(lambda line: (line[0], " ".join(line[1:])))
sentenceData = spark.createDataFrame(lines, ["label", "sentence"])

# Map sentence data to hashingTF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=numfeatures)
featurizedData = hashingTF.transform(wordsData)
# featurizedData.show()

# Map string labels to integers
df = featurizedData.select('label', 'features')
data0 = df.replace(['World', 'Sports', 'Fashion & Style', 'Books', 'Music',
                    'Television', 'Movies', 'Technology', 'Science', 'Food', 'Real Estate',
                    'Theater', 'Health', 'Travel', 'Education', 'Your Money', 'Politics',
                    'Economy', 'Art & Design'],
                   ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
                    '14', '15', '16', '17', '18', '0'], 'label')

category = ['Art & Design', 'World', 'Sports', 'Fashion & Style', 'Books', 'Music',
            'Television', 'Movies', 'Technology', 'Science', 'Food', 'Real Estate', 'Theater',
            'Health', 'Travel', 'Education', 'Your Money', 'Politics', 'Economy']
dictionary = {'Art & Design': 0, 'World': 1, 'Sports': 2, 'Fashion & Style': 3, 'Books': 4,
              'Music': 5, 'Television': 6, 'Movies': 7, 'Technology': 8, 'Science': 9,
              'Food': 10, 'Real Estate': 11, 'Theater': 12,
from environment import spark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I I I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")],
    ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show(truncate=False)
def main():
    time_start = time.time()
    data = 'train_review.json'  # sys.argv[1]
    sc = elly_func.start_spark('Final_Project')

    # Total pairs = 1,029,758
    textRDD = sc.textFile(data).map(elly_func.tojson) \
        .map(lambda x: ((x['user_id'], x['business_id']), x['text'])) \
        .reduceByKey(lambda a, b: a + b) \
        .mapValues(remove_blank) \
        .map(lambda x: (x[0][0], x[0][1], x[1]))

    # Create DataFrame
    tableA = spark_session(textRDD).createDataFrame(textRDD, ['user_id', 'business_id', 'text'])

    # Remove stopwords; the raw text column is no longer needed, so drop it.
    remover = StopWordsRemover(inputCol="text", outputCol="filtered")
    df = remover.transform(tableA).drop('text')

    # Convert to the string form the Tokenizer expects.
    test = df.withColumn("sentence", df["filtered"].cast("string"))

    # Split each sentence into useful terms.
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(test)

    # TF-IDF with 200 hashed features (keeps the high-frequency terms).
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
    featurizedData = hashingTF.transform(wordsData)

    # Vectorize the terms.
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # User/business-to-term relations, ordered by id.
    user_profile = rescaledData.select('user_id', 'business_id', 'features').orderBy("user_id")
    business_profile = rescaledData.select('business_id', 'features').orderBy("business_id")

    def set2list(x):
        temp = []
        for i in x:
            temp.append(i)
        return temp

    # Build user/business-to-term dictionaries.
    # 91730 businesses: [('business_id', ['case', 'eat', ...]), ...]
    business_dic = business_profile.rdd.map(lambda x: (x[0], list(x[1].indices))) \
        .reduceByKey(lambda a, b: a + b) \
        .mapValues(lambda x: set2list(set(x))).collectAsMap()
    # 13167 users
    # user_profile_dic = {'user1': [word2, word8, word24, ...], ...}
    user_dic = user_profile.rdd.map(lambda x: (x[0], list(x[2].indices))) \
        .reduceByKey(lambda a, b: a + b) \
        .mapValues(lambda x: set2list(set(x))).collectAsMap()

    user_bus = sc.textFile(data).map(elly_func.tojson) \
        .map(lambda x: ((x['user_id'], x['business_id']))) \
        .reduceByKey(lambda a, b: a + b).collectAsMap()

    # Still to do: compute the cosine similarity.

    time_end = time.time()
    print('Duration:', time_end - time_start)
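# The cosine-similarity step is only noted as a to-do above. A minimal sketch under the
# assumption that similarity is taken between the term-index sets in user_dic and
# business_dic (boolean profiles); the example ids are placeholders, not from the source.
import math


def cos_sim(user_id, business_id):
    u = set(user_dic.get(user_id, []))
    b = set(business_dic.get(business_id, []))
    if not u or not b:
        return 0.0
    # For boolean vectors, cosine similarity is |A ∩ B| / sqrt(|A| * |B|).
    return len(u & b) / math.sqrt(len(u) * len(b))


print(cos_sim('some_user_id', 'some_business_id'))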
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(), outputCol="features1")
    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(), outputCol="features2")
    vecAssembler = VectorAssembler(inputCols=["features1", "features2"], outputCol="features")
    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])

    paramGrid = (ParamGridBuilder()
                 .addGrid(lor.maxIter, [10, 20])
                 .addGrid(lor.regParam, [0.1, 0.01])
                 .build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3, estimatorParamMaps=paramGrid,
                              evaluator=eva, numFolds=2)

    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{"name": "Tokenizer_1"}, {"name": "HashingTF_1"}],
            },
            {
                "name": "Pipeline_3",
                "stages": [{"name": "Tokenizer_2"}, {"name": "HashingTF_2"}],
            },
            {
                "name": "CrossValidator",
                "evaluator": {"name": "MulticlassClassificationEvaluator"},
                "tuned_estimator": {
                    "name": "Pipeline_4",
                    "stages": [
                        {"name": "VectorAssembler"},
                        {"name": "OneVsRest", "classifier": {"name": "LogisticRegression"}},
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[metadata.param_search_estimators[0].uid]
            == "CrossValidator")
train = train.withColumn("comment_text", stemmer_udf("comment_text"))


def check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate):
    if (toxic + severe_toxic + obscene + threat + insult + identity_hate) > 0:
        return 0
    else:
        return 1


# Merge the six label columns into a single binary "clean" flag.
mergeCols = udf(lambda toxic, severe_toxic, obscene, threat, insult, identity_hate:
                check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate),
                IntegerType())

train = train.withColumn("clean", mergeCols(train["toxic"], train["severe_toxic"],
                                            train["obscene"], train["threat"],
                                            train["insult"], train["identity_hate"]))

tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered").setCaseSensitive(False)
hashingTF = HashingTF().setNumFeatures(1000).setInputCol("filtered").setOutputCol("rawFeatures")
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(0)
nb = NaiveBayes(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

train = train.drop('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
train = train.withColumnRenamed("clean", "label")
training_spark_df_binary, testing_spark_df_binary = train.randomSplit([0.8, 0.2], seed=2018)

paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures, [1000]) \
    .addGrid(nb.smoothing, [1]) \
    .build()
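# The grid above is built but never used in this excerpt. A minimal sketch of wiring it into
# a cross-validated fit; the fold count and evaluator choice are assumptions, not from the
# source.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(training_spark_df_binary)
predictions = cv_model.transform(testing_spark_df_binary)
print("AUC:", evaluator.evaluate(predictions))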
conf = SparkConf()
conf.setAppName("part2_uni")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)

# Read the input
lines = sc.wholeTextFiles("/cosc6339_s17/books-longlist/")

# Configure the SparkSession
spark = SparkSession(sc)
hasattr(lines, "toDF")

# Tokenize the words and convert into a DataFrame
tokenize = lines.map(part2).toDF(["bookname", "words"])

# Convert into unigrams
unigram = NGram(n=1, inputCol="words", outputCol="unigrams")
unigramdataframe = unigram.transform(tokenize)

# Find the tf values
hashingTF = HashingTF(inputCol="unigrams", outputCol="unigram-tf")
tf = hashingTF.transform(unigramdataframe)

# Find the idf values
idf = IDF(inputCol="unigram-tf", outputCol="unigram-tf-idf")
idfModel = idf.fit(tf)
tfidfignore = idfModel.transform(tf)

# Save the output
tfidfignore.rdd.saveAsTextFile("/bigd12/output2_1")
test.cache()

regexTokenizer = RegexTokenizer(gaps=False,
                                pattern="\\w+",
                                inputCol="name",
                                outputCol="name_parts",
                                toLowercase=True)

stopWords = ["mr", "mrs", "miss", "master", "jr", "j", "c", "d"]
remover = StopWordsRemover(inputCol="name_parts",
                           outputCol="filtered_name_parts",
                           stopWords=stopWords)

hashingTF = HashingTF(numFeatures=1000,
                      inputCol="filtered_name_parts",
                      outputCol="text_features")

sexIndexer = StringIndexer(inputCol="sex",
                           outputCol="sexIndexed",
                           handleInvalid="keep")

embarkedIndexer = StringIndexer(inputCol="embarked",
                                outputCol="embarkedIndexed",
                                handleInvalid="keep")

imputer = Imputer(strategy="mean",
                  inputCols=[
                      "pclass", "sibsp", "parch", "sexIndexed", "embarkedIndexed",
                      "age", "fare"
                  ],
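# The Imputer call above is cut off before its output columns. A minimal sketch of how it
# could be closed; the output column names are assumptions, not taken from the source.
imputer = Imputer(strategy="mean",
                  inputCols=["pclass", "sibsp", "parch", "sexIndexed", "embarkedIndexed",
                             "age", "fare"],
                  outputCols=["pclass_imp", "sibsp_imp", "parch_imp", "sexIndexed_imp",
                              "embarkedIndexed_imp", "age_imp", "fare_imp"])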