def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder

    return model, words_prediction
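# Because fit_kmeans() persists the fitted PipelineModel to "./kmeans", a later run can
# reload it instead of refitting. A minimal sketch, assuming the same Spark session and a
# DataFrame with a "title" column; the name `new_products_df` is hypothetical.
from pyspark.ml import PipelineModel

reloaded_model = PipelineModel.load("./kmeans")
clustered = reloaded_model.transform(new_products_df)
# "6_kmeans" is the prediction column name produced by the step counter above.
clustered.select("title", "6_kmeans").show(5)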
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    """Process the dataset and build feature vectors."""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    """Train the decision-tree model."""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    """Test the model."""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    """Score user input: a single news item."""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """Evaluate the model."""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()

    return render(request, {'resultList': resultList})
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):
    global idfModel
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    return dataframe
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
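# A minimal call-site sketch for run_tf_idf_spark_ml(), assuming an active SparkSession
# named `spark`; the toy sentences below are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# A tiny corpus with the "body" column the function expects.
docs = spark.createDataFrame(
    [("spark makes tf idf easy",), ("tf idf weighs rare terms higher",)],
    ["body"],
)

# A smaller feature space keeps the example output readable.
tfidf_df = run_tf_idf_spark_ml(docs, numFeatures=1 << 10)
tfidf_df.select("words", "features").show(truncate=False)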
def extract_idf_features(p_df, input_col, output_col):
    """
    Extracts IDF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.
    """
    idf = IDF(inputCol=input_col, outputCol=output_col)
    idfModel = idf.fit(p_df)
    return idfModel.transform(p_df)
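# extract_idf_features() expects the input column to already hold term-frequency vectors
# (e.g. from HashingTF or CountVectorizer). A minimal sketch; `tf_df` and its
# "raw_features" column are hypothetical names, not part of the original code.
tfidf_df = extract_idf_features(tf_df, "raw_features", "tfidf_features")
tfidf_df.select("tfidf_features").show(3, truncate=False)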
def create_features(raw_data):
    # Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2])))
    # Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    # Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    # Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df, no_of_features, ip_col):
    # from pyspark.sql.functions import udf
    # from pyspark.sql.types import *
    output_raw_col = ip_col + "raw_features"
    output_col = ip_col + "features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
def test_idf(self):
    dataset = self.spark.createDataFrame([
        (DenseVector([1.0, 2.0]),),
        (DenseVector([0.0, 1.0]),),
        (DenseVector([3.0, 0.2]),)], ["tf"])
    idf0 = IDF(inputCol="tf")
    self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
    idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
    self.assertEqual(idf0m.uid, idf0.uid,
                     "Model should inherit the UID from its parent estimator.")
    output = idf0m.transform(dataset)
    self.assertIsNotNone(output.head().idf)
    # Test that parameters transferred to Python Model
    check_params(self, idf0m)
def tf_idf(df, column):
    """
    Compute TF-IDF of a corpus.
    Transformation: array<string> --> vector
    """
    df = preprocess(df, column)  # text to list of terms
    (df, voc) = count(df, column)
    # creates a TF-IDF model and uses it to compute the feature vector.
    idf = IDF(inputCol=column, outputCol='_' + column)
    model = idf.fit(df)
    df = model.transform(df)
    df = replace(df, column, '_' + column)
    return (df, voc)
def append_tf_idf(self, df):
    """
    Calculate term frequency and inverse document frequency
    based on at least 1 visit hourly in this case. Compares how often the tokens appeared
    at least once per hour compared to other tokens.
    Not used for the main purpose of the project.

    Args:
        :param df: Dataframe parameter.

    Returns:
        :return: Dataframe with term frequency and inverse document frequency added in the columns
                 'rawFeatures' and 'features' respectively.
    """
    # Create TF column.
    hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
    tf = hashingTF.transform(df)
    tf.persist(StorageLevel.MEMORY_AND_DISK)
    # Create IDF column.
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(tf)
    tfidf = idfModel.transform(tf)
    return tfidf
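# A minimal call-site sketch for append_tf_idf(); `processor` (an instance of the enclosing
# class) and `tokens_df` (a DataFrame whose "tokens" column holds arrays of strings) are
# hypothetical names, not part of the original code.
tfidf_df = processor.append_tf_idf(tokens_df)
tfidf_df.select("tokens", "features").show(5, truncate=False)

# Persisting the TF DataFrame before idf.fit() is a reasonable choice here: the fit triggers
# one pass over the data and the transform another, so caching avoids recomputing the hashing.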
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() &
                                  (dataset['useragent_locale'].isNull() |
                                   (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))) \
                             .reduceByKey(lambda x, y: x + y) \
                             .toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])) \
                             .toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)) \
        .rdd.map(lambda p: (p['signature'],
                            sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values),
                                   key=lambda i: i[1], reverse=True)[:10])) \
        .collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests])
                 for signature, bests in bests_per_doc])
return doc.replace("<br /><br />"," ").lower() rdd = labeledRdd.map(lambda doc : (cleanLower(doc[0]),doc[1])) print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ['review', 'label']) dfTrain, dfTest = df.randomSplit([0.8,0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words('english'))) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(),maxIter=30, regParam=0.01) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision') # grid=(ParamGridBuilder() # .baseOn([evaluator.metricName,'precision']) # .addGrid(dt.maxDepth, [10,20]) # .build())
pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) gbm = H2OGBM(splitRatio=0.8, seed=1, featuresCols=[idf.getOutputCol()], labelCol="label") dl = H2ODeepLearning(epochs=10, seed=1, l1=0.001, l2=0.0, hidden=[200, 200], featuresCols=[idf.getOutputCol()], labelCol="label")
from test_df """ test_df = spark.sql(query) test_df = test_df.withColumn('id', F.col('id') - 1) test_df.show(5) ######################################################################################################## # Build pipeline and run indexer = StringIndexer(inputCol="category", outputCol="label") tokenizer = RegexTokenizer(pattern=u'\W+', inputCol="text", outputCol="words", toLowercase=False) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures") idf = IDF(inputCol="rawFeatures", outputCol="features") lr = LogisticRegression(maxIter=20, regParam=0.001) # Builing model pipeline pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr]) # Train model on training set model = pipeline.fit( train_df ) #if you give new names to your indexed datasets, make sure to make adjustments here # Model prediction on test set pred = model.transform(test_df) # ...and here # Model prediction accuracy (F1-score) pl = pred.select("label", "prediction").rdd.cache()
# creating tokens/words from the sentence data tokenizer = Tokenizer(inputCol="document", outputCol="words") wordsData = tokenizer.transform(documentData) print (documentData) wordsData.show() """**a.Performing a task without NLP**""" # applying tf on the words data hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200) tf = hashingTF.transform(wordsData) # alternatively, CountVectorizer can also be used to get term frequency vectors # calculating the IDF tf.cache() idf = IDF(inputCol="rawFeatures", outputCol="features") idf = idf.fit(tf) tfidf = idf.transform(tf) #displaying the results tfidf.select("label", "features").show() print("TF-IDF without NLP:") for each in tfidf.collect(): print(each) print(each['rawFeatures']) spark.stop() """**b.Performing the task with lemmitization**""" import nltk;
from pyspark.ml.feature import Tokenizer

sentences_df = spark.createDataFrame(
    [(1, 'This is an introduction to Spark MLlib'),
     (2, 'Mllib includes libraries for classification and regression'),
     (3, 'It also contains supporting tools for pipelines')],
    ['id', 'sentences'])
sentences_df.show()

# The input column must match the DataFrame column name ('sentences').
sent_token = Tokenizer(inputCol='sentences', outputCol='words')
sent_tokenized_df = sent_token.transform(sentences_df)
sent_tokenized_df.show()

from pyspark.ml.feature import HashingTF, IDF

sentences_df.take(1)
sent_tokenized_df.take(1)

hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=20)
sent_hfTF_df = hashingTF.transform(sent_tokenized_df)
sent_hfTF_df.take(1)

idf = IDF(inputCol='rawFeatures', outputCol='idf_features')
idfModel = idf.fit(sent_hfTF_df)
tfidf_df = idfModel.transform(sent_hfTF_df)
tfidf_df.take(1)
remover.setStopWords(sw)
cleanDataTrain = remover.transform(wordsDataTrain)
cleanDataTest = remover.transform(wordsDataTest)

# Make onegrams
onegram = NGram(n=1, inputCol="filtered", outputCol="onegram")
onegramedDataTrain = onegram.transform(cleanDataTrain)
onegramedDataTest = onegram.transform(cleanDataTest)

# Find hashed term-frequency value of word vector
hashingTF = HashingTF(inputCol="onegram", outputCol="rawFeatures", numFeatures=100000)
featurizedDataTrain = hashingTF.transform(onegramedDataTrain)
featurizedDataTest = hashingTF.transform(onegramedDataTest)

# Find IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModelTrain = idf.fit(featurizedDataTrain)
idfModelTest = idf.fit(featurizedDataTest)
rescaledDataTrain = idfModelTrain.transform(featurizedDataTrain)
rescaledDataTest = idfModelTest.transform(featurizedDataTest)

# Final test and train data
train_data = rescaledDataTrain.select('features', 'label')
test_data = rescaledDataTest.select('features', 'label')

# Multinomial Naive Bayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
model = nb.fit(train_data)
result = model.transform(test_data)
result.show()
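# Worth flagging in the block above: IDF is fitted separately on the train and test splits,
# so the two feature spaces use different document-frequency statistics. A common alternative
# (shown here as a sketch, not the author's code) is to fit IDF on the training split only
# and reuse that fitted model for the test split:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedDataTrain)
# Both splits now share one set of document-frequency weights.
rescaledDataTrain = idfModel.transform(featurizedDataTrain)
rescaledDataTest = idfModel.transform(featurizedDataTest)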
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode)
                                  if unicodedata.category(unichr(i)).startswith('S') or
                                  unicodedata.category(unichr(i)).startswith('P') or
                                  unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '####took %d seconds' % (timer() - start_i)

    print '---Building the corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '####took %d seconds' % (timer() - start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '####took %d seconds' % (timer() - start_i)

    print '---Computing TF-IDF for the products---'
    start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    # persist this so it does not have to be rebuilt at prediction time
    wordsDataDF = sqlContext.createDataFrame(wordsData)
    # persisted for prediction
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer() - start_i)

    print '--Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")

    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '####took %d seconds' % (timer() - start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)]))
    acuraccy = float(prediction.filter(lambda (x, v): x[0] == v[0]).count()) / float(prediction.count())
    print 'accuracy of %f' % acuraccy
    print '####took %d seconds' % (timer() - start_i)

    print '---Fetching the posts---'
    start_i = timer()
    posts = list()
    wb = load_workbook(filename='/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']

    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral'
                        else 0 if cell.value == 'Negative'
                        else removeAccents(cell.value))
        if len(post) > 0:
            posts.append(tuple(post))

    print '####took %d seconds' % (timer() - start_i)

    print '---Building the corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                             .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                             .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                             .cache())
    print '####took %d seconds' % (timer() - start_i)

    print '---Computing TF-IDF for the posts---'
    start_i = timer()
    wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '####took %d seconds' % (timer() - start_i)

    print '--Building the SVM model---'
    start_i = timer()
    model = SVMWithSGD.train(VSMTrain, iterations=100)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm")

    model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    print '---Testing the SVM model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features)))
    # note: this counts mismatches, so the printed value is really the error rate
    acuraccy = prediction.filter(lambda (v, p): v != p).count() / float(prediction.count())
    print 'accuracy of %f' % acuraccy
    print '####took %d seconds' % (timer() - start_i)

    print 'The whole process took %d seconds' % (timer() - start)
lrModel = lr.fit(trainingData) predictions = lrModel.transform(testData) predictions.filter(predictions['prediction'] == 0) \ .select("Descript","Category","probability","label","prediction") \ .orderBy("probability", ascending=False) \ .show(n = 10, truncate = 30) # Evaluate the performance from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") evaluator.evaluate(predictions) # 0.9725282146509521 # 2.Logistic Regression using TF-IDF Features from pyspark.ml.feature import HashingTF, IDF # Add HashingTF and IDF to transformation hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000) idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms # Redo Pipeline pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx]) pipelineFit = pipeline.fit(data) dataset = pipelineFit.transform(data) # Randomly split data into training and test sets. set seed for reproducibility (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100) # Build the model lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0) # Train model with Training Data lrModel = lr.fit(trainingData) # Make predictions on Testing Data predictions = lrModel.transform(testData) predictions.filter(predictions['prediction'] == 0) \ .select("Descript","Category","probability","label","prediction") \ .orderBy("probability", ascending=False) \
#%% names_df = names_df.dropna(subset='name') names_df.show() #%% tokenizer = Tokenizer(inputCol="name", outputCol="words") wordsData = tokenizer.transform(names_df) wordsData.show() #%% stopwords = [] stopwords.extend(StopWordsRemover.loadDefaultStopWords('english')) remover = StopWordsRemover(inputCol="words", outputCol="cleanedWords", stopWords=stopwords) cleanedWordsData = remover.transform(wordsData) cleanedWordsData.show() #%% hashingTF = HashingTF(numFeatures=4096, inputCol="cleanedWords", outputCol="tfFeatures") tfWordsData = hashingTF.transform(cleanedWordsData) tfWordsData.show() #%% idf = IDF(inputCol="tfFeatures", outputCol="tfIdfFeatures") idfModel = idf.fit(tfWordsData) results = idfModel.transform(tfWordsData) results.show()
pipeline_model = pipeline.fit(train_data)
test_predicted = pipeline_model.transform(test_data)

# Use a pipeline with TF-IDF
train_data_2 = load_dataframe_text_unsplitted("20ng-train-all-terms.txt")
test_data_2 = load_dataframe_text_unsplitted("20ng-test-all-terms.txt")

vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="bag_of_words")

#pipeline_tf_idf = Pipeline(stages=[label_indexer, HashingTF, IDF, classifier])
pipeline_tf_idf = Pipeline(
    stages=[label_indexer, tokenizer, hashingTF, idf, classifier])
pipeline_model_tf_idf = pipeline_tf_idf.fit(train_data_2)
test_predicted_tf_idf = pipeline_model_tf_idf.transform(test_data_2)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index", predictionCol="label_index_predicted",
    metricName="accuracy")
accuracy = evaluator.evaluate(test_predicted)
print("Accuracy = {:.2f}".format(accuracy))
['product_uid', 'id', 'search_term_clean', 'relevance', 'text_clean']) # Step 1: split text field into words tokenizer = Tokenizer(inputCol="text_clean", outputCol="text_token") fulldata = tokenizer.transform(fulldata) print "Tokenized Text:" print fulldata.head() print "################" # Step 2: compute term frequencies hashingTF = HashingTF(inputCol="text_token", outputCol="tf", numFeatures=10000) fulldata = hashingTF.transform(fulldata) print "TERM frequencies:" print fulldata.head() print "################" # Step 3: compute inverse document frequencies idf = IDF(inputCol="tf", outputCol="tf_idf") idfModel = idf.fit(fulldata) fulldata = idfModel.transform(fulldata) print "IDF :" print fulldata.head() print "################" #OK we do the same for the search term # Step 1: split text field into words tokenizer = Tokenizer(inputCol="search_term_clean", outputCol="search_token") fulldata = tokenizer.transform(fulldata) print "Tokenized Search:" print fulldata.head() print "################" # Step 2: compute term frequencies hashingTF = HashingTF(inputCol="search_token",
# Compute the sentiment column based on rating
sentiment = when(col("rating") <= 5, 0).otherwise(1)
df = df.withColumn("sentiment", sentiment)
df = df.withColumn('length', length(df['review']))

# %% [markdown]
# ## Feature Transformation

# %%
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
pos_neg = StringIndexer(inputCol='sentiment', outputCol='label')

# %%
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# %%
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

# %%
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
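# %%
# The transformers above chain naturally into a single Pipeline. A minimal sketch: NaiveBayes
# is just one of the classifiers imported above, and `df` is assumed to carry the "review"
# and "sentiment" columns built earlier; none of this is the original author's code.
from pyspark.ml import Pipeline

nb = NaiveBayes(featuresCol='features', labelCol='label')
data_prep_pipe = Pipeline(stages=[pos_neg, tokenizer, stopremove, count_vec, idf, clean_up, nb])

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)
sentiment_model = data_prep_pipe.fit(train_df)
predictions = sentiment_model.transform(test_df)
predictions.select('review', 'label', 'prediction').show(5)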
smsDf = sqlContext.createDataFrame(smsXformed, ["label", "message"]) smsDf.cache() smsDf.select("label", "message").show() #Split training and testing (trainingData, testData) = smsDf.randomSplit([0.9, 0.1]) trainingData.count() testData.count() testData.collect() #Setup pipeline from pyspark.ml.classification import NaiveBayes, NaiveBayesModel from pyspark.ml import Pipeline from pyspark.ml.feature import HashingTF, Tokenizer from pyspark.ml.feature import IDF tokenizer = Tokenizer(inputCol="message", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), \ outputCol="tempfeatures") idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features") nbClassifier = NaiveBayes() pipeline = Pipeline(stages=[tokenizer, hashingTF, \ idf, nbClassifier]) nbModel = pipeline.fit(trainingData) prediction = nbModel.transform(testData) prediction.groupBy("label", "prediction").count().show()
#print(data.head(5)) ##creating rdd file sc = SparkContext("local", "app") sqc = SQLContext(sc) df = sqc.createDataFrame(data, ['type', 'text']) #NEW VARIABLE GENERATION dataCleaned = df.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text']))) dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1])) dfClean = sqc.createDataFrame(dataClean, ['label', 'words']) dfClean.show(5) hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000) tf = hashingTF.transform(dfClean) idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf) dfFinal = idf.transform(tf) # Fit on whole dataset to include all labels in index. labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal) # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal) # Split the data into training and test sets (20% held out for testing) (trainingData, testData) = dfFinal.randomSplit([0.8, 0.2]) # Train the model. #rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") nb = NaiveBayes(smoothing = 1.0, labelCol="indexedLabel", featuresCol="indexedFeatures")
def trainModel(self):
    logger.info("Training the model...")

    query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

    def SQLtoURL(query):
        data = query.replace('\n', ' ').replace('\t', ' ').replace(' ', ' ').replace(' ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(data={'tq': query},
                                url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
                                headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
        return response.content

    # call the query helper defined above
    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())

    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")

    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)

    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData

    logger.info("model is built!")
# configuring spark
conf = SparkConf()
conf.setAppName("part2_uni")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)

# reading input
lines = sc.wholeTextFiles("/cosc6339_s17/books-longlist/")

# configuring SparkSession
spark = SparkSession(sc)
hasattr(lines, "toDF")

# tokenizing the words and converting into dataframes
tokenize = lines.map(part2).toDF(["bookname", "words"])

# converting into n-grams (NGram with n=2 produces bigrams, despite the "unigram" column names)
unigram = NGram(n=2, inputCol="words", outputCol="unigrams")
unigramdataframe = unigram.transform(tokenize)

# finding the tf value
hashingTF = HashingTF(inputCol="unigrams", outputCol="unigram-tf")
tf = hashingTF.transform(unigramdataframe)

# finding the idf value
idf = IDF(inputCol="unigram-tf", outputCol="unigram-tf-idf")
idfModel = idf.fit(tf)
tfidfignore = idfModel.transform(tf)

# saving the output
tfidfignore.rdd.saveAsTextFile("/bigd12/output2_2")
'45G').set('spark.driver.maxResultSize', '10G') sc = SparkContext(conf=spark) sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(indexNewsList, ["label", "text"]) tokenizer = Tokenizer(inputCol="text", outputCol="words") remover = StopWordsRemover(inputCol="words", outputCol="filtered") hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000) idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) nb = NaiveBayes(smoothing=1.0, modelType="multinomial") rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=15, maxBins=32) pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf]) pipeline1 = Pipeline(stages=[tokenizer, remover, hashingTF, idf, rf]) pipeline_nb = pipeline1.fit(df) pipelineFit = pipeline.fit(df) pipeline_nb.write().overwrite().save("model_rf1") dataset = pipelineFit.transform(df)
# Stop words
add_stopwords = [
    'the', 'of', 'in', 'a', 'an', 'at', 'as', 'on', 'for', 'it', 'we',
    'you', 'want', 'up', 'to', 'if', 'are', 'is', 'and', 'our', 'with',
    'from', '-', 'your', 'so'
]
stopwords_remover = StopWordsRemover(
    inputCol='desc_words', outputCol='desc_words_filtered').setStopWords(add_stopwords)
df = stopwords_remover.transform(df)

# Compute the TF-IDF of each document
hashingTF = HashingTF(inputCol='desc_words_filtered', outputCol="desc_words_tf")
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol="desc_words_tfidf").fit(tf)
tfidf = idf.transform(tf).cache()

print('\n TF-IDF for each hotel')
tfidf.select('desc_words_tfidf').show(truncate=False)

# Normalize the data
from pyspark.ml.feature import Normalizer

normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
tfidf.select("id", "norm").show()

import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
#tfidf = tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") < psf.col("a2.id")).withColumn('similarity', dot_udf("a1.norm", "a2.norm"))
#tfidf.show()
def main(input_file, bus_parquet, input_model): data = spark.read.parquet(input_file) df_business = spark.read.parquet(bus_parquet) df = data.select('business_id', 'text') #df_review = df.groupby('business_id').agg(functions.collect_set('text')).show(100) review_rdd = df.rdd.map(tuple).reduceByKey(operator.add) review_df = spark.createDataFrame(review_rdd).withColumnRenamed('_1', 'business_id').withColumnRenamed('_2', 'text') # create text preprocessing pipeline # Build the pipeline # tokenize review regexTokenizer = RegexTokenizer(gaps=False, pattern='\w+', inputCol='text', outputCol='text_token') #yelpTokenDF = regexTokenizer.transform(review_df) # filter stopwords stopWordsRemover = StopWordsRemover(inputCol='text_token', outputCol='nonstopwrd') #yelp_remove_df = stopWordsRemover.transform(yelpTokenDF) # TF countVectorizer = CountVectorizer(inputCol = 'nonstopwrd', outputCol='raw_features', minDF=2) #yelp_CountVec = cv.transform(yelp_remove_df) # IDF idf = IDF(inputCol="raw_features", outputCol="features") pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, idf]) #tfidf_model = pipeline.fit(review_df) #tfidf_model.write().overwrite().save('tfidf_model') tfidf_model = PipelineModel.load('tfidf_model') result_tfidf = tfidf_model.transform(review_df) yelp = result_tfidf lda = LDA(k=15, maxIter=100) # already saved model #model = lda.fit(yelp) # save model #model.write().overwrite().save(input_model) model = LocalLDAModel.load(input_model) # lda output column topicDistribution lda_df = model.transform(yelp) # test result x = sc.parallelize([('aaa', 'chicken cheese burger')]).toDF(['business_id', 'text']) x_tfidf = tfidf_model.transform(x) lda_x = model.transform(x_tfidf) input_vec = lda_x.select('topicDistribution').collect()[0][0] lda_vec = lda_df.select('business_id', 'topicDistribution').rdd.map(lambda x: (x[0], x[1])).collect() # compute similarity t = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in lda_vec) # recommendation's cosine values similarity = spark.createDataFrame(t).withColumnRenamed('_1', 'business_id').withColumnRenamed('_2', 'similarity_score') df_result = df_business.join(similarity, 'business_id', 'right').select(similarity['business_id'] ,'similarity_score', 'categories').orderBy('similarity_score', ascending = False) result = getKeyWordsRecoms('chicken cheese burger', 20, tfidf_model, model, lda_vec) result.show()
from pyspark.ml.feature import CountVectorizer

# we will remove words that appear in 5 docs or less
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
    .setInputCol("filtered")\
    .setOutputCol("tf")

# In[24]:

# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(airportCleanDF)

# In[25]:

from pyspark.ml.feature import IDF

idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')

# In[26]:

idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(airportCleanDF)

# In[27]:

tfidf_df = idf_pipeline.transform(airportCleanDF)

# In[28]:

tfidf_df.printSchema()
print('data frame where common words are removed'.upper())
remover.transform(sentence_DF).show()

# n-grams: sequence of tokens of consecutive 'n' words
from pyspark.ml.feature import NGram

ngram = NGram(n=2, inputCol='words', outputCol='grams')
ngram.transform(tok_final).show()
tok_final_n = ngram.transform(tok_final)
tok_final_n.select('grams').show(truncate=False)
# The n-grams help explore relationships between close words

from pyspark.ml.feature import HashingTF, IDF

hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
feature_data = hashing_tf.transform(tok_final)
idf = IDF(inputCol='rawFeatures', outputCol='Features')
idf_model = idf.fit(feature_data)
rescaled_data = idf_model.transform(feature_data)
# See how words were transformed into numbers; this is ready for a supervised machine learning algorithm
rescaled_data.select('id', 'Features').show(truncate=False)

############

from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([
    (0, ['hello', 'are', 'you', 'man']),
    (1, ['hello', 'hello', 'man', 'I', 'am', 'great', 'I', 'am', 'fantastic', 'you', '?', 'you', 'okay', '?'])
], ['id', 'tokens'])
# minDF: minimum number of documents in which a word must appear in order to be considered as a feature
cv = CountVectorizer(inputCol='tokens', outputCol='countVec', vocabSize=10, minDF=2.0)
cv.fit(df).transform(df).show(truncate=False)
print("Note that 'hello' and 'you' were repeated twice in the last document")
featurizeData = hashingTF.transform(wordsData)
featurizeData.select("words", "rawFeatures").show(truncate=False)

#%%
""" CountVectorizer term-frequency counts (the counts can be mapped back to the actual words) """
countVector = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2)
cvModel = countVector.fit(wordsData)
cv_df = cvModel.transform(wordsData)
cv_df.show(4, False)

#%%
# voc=cvModel.vocabulary
# getKeywordFunc=udf()

# %%
""" Train the IDF model """
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(cv_df)
rescaledData = idfModel.transform(cv_df)

# %%
rows = rescaledData.collect()
# with open("./collect_file.txt","w+") as f:
#     f.write(str(rows))

# %%
Features = rescaledData.select("features").toPandas()
Words = rescaledData.select("words").toPandas()

#%%
features_dict = Features.to_dict()
# with open("./features_dict.txt","w") as f:
#     f.write(str(features_dict["features"]))

# %%
features_numpy = np.array(Features)
data = spark.read.load(sys.argv[1]) df = data.filter((col('date') >= '1895') & (col('seq') =='1')) \ .select(year('date').alias('year'), 'id', 'text') # https://danvatterott.com/blog/2018/07/08/aggregating-sparse-and-dense-vectors-in-pyspark/ def dense_to_array(v): new_array = list([float(x) for x in v]) return new_array dense_to_array_udf = udf(dense_to_array, ArrayType(FloatType())) indexer = StringIndexer(inputCol="id", outputCol="label") tokenizer = Tokenizer(inputCol="text", outputCol="tokens") vectorizer = CountVectorizer(inputCol="tokens", outputCol="rawFeatures") idf = IDF(inputCol="rawFeatures", outputCol="vector", minDocFreq=1) pipeline = Pipeline(stages=[indexer, tokenizer, vectorizer, idf]) model = pipeline.fit(df) results = model.transform(df) \ .select(year('date').alias('year'), 'label', 'vector') \ .withColumn('vector', dense_to_array_udf('vector')) results = model.transform(df).select('year', 'label', 'vector') results.write \ .partitionBy('year') \ .format('csv') \ .options(compression='gzip', sep='\t', header='true') \ .save(sys.argv[2])
df_seg = df.withColumn('seg', seg_udf(df.content)).select('seg')
df_seg.show()

# Turn the segmented text into word arrays
tokenizer = Tokenizer(inputCol='seg', outputCol='words')
df_seg_arr = tokenizer.transform(df_seg).select('words')
df_seg_arr.show()

# Build text features from the segmented words
tf = HashingTF(numFeatures=1 << 18, binary=False, inputCol='words', outputCol='rawfeatures')
df_tf = tf.transform(df_seg_arr).select('rawfeatures')
df_tf.show()

idf = IDF(inputCol='rawfeatures', outputCol='features')
idfModel = idf.fit(df_tf)
df_tfidf = idfModel.transform(df_tf)
df_tfidf.show()

# Split into training and prediction sets
splits = df_tfidf.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# Define the model
kmeans = KMeans(featuresCol="features", predictionCol="prediction", k=6,
                initMode="k-means||", initSteps=5,
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

sentenceData = spark.createDataFrame([
    (0.0, "a b b a a a b"),
    (0.0, "a b a b b a a"),
    (1.0, "b a b bb aa b")
], ["label", "sentence"])

# TF
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

# IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.show(20, False)

spark.stop()
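# As the comment above notes, CountVectorizer is a drop-in alternative to HashingTF when an
# explicit vocabulary is wanted. A minimal sketch on the same wordsData (it would have to run
# before the spark.stop() call above); this is not part of the original example.
from pyspark.ml.feature import CountVectorizer

# CountVectorizer builds an explicit vocabulary, so vector indices can be mapped back to terms.
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=20)
cvModel = cv.fit(wordsData)
countsData = cvModel.transform(wordsData)
print(cvModel.vocabulary)  # terms corresponding to the vector indices

idf = IDF(inputCol="rawFeatures", outputCol="features")
rescaledCounts = idf.fit(countsData).transform(countsData)
rescaledCounts.show(20, False)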
review_text = BeautifulSoup(raw_review).text # # 2. Remove non-letters letters_only = re.sub("[^a-zA-Z]", " ", review_text) # # 3. Convert to lower case, split into individual words words = letters_only.lower().split() # # 4. Remove stop words meaningful_words = [w for w in words if not w in stops] # # 5. Join the words back into one string separated by space, # and return the result. return " ".join( meaningful_words) stops = set(stopwords.words("english")) lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv") rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys() parts = rows.map(lambda l: l.split("\t")) review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2]))) schemeReview = sqlContext.createDataFrame(review) tokenizer = Tokenizer(inputCol="review", outputCol="words") wordsData = tokenizer.transform(schemeReview) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300) featurizedData = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) selectData = rescaledData.select("label","features")
pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) if algo == "gbm": ## Create GBM model algoStage = H2OGBM(ratio=0.8, seed=1, featuresCols=[idf.getOutputCol()], predictionCol="label") elif algo == "dl": ## Create H2ODeepLearning model algoStage = H2ODeepLearning(epochs=10, seed=1, l1=0.001, l2=0.0,
wrangled.show(4, truncate=False)

--------------------------------------------------
# Exercise_11

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)

--------------------------------------------------
# Exercise_12

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)
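# A short follow-up sketch for Exercise_12 (not part of the original exercise text):
# evaluate the predictions with a confusion matrix, using the label and prediction
# columns produced above.
prediction.groupBy('label', 'prediction').count().show()

# Accuracy from the confusion-matrix counts.
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
accuracy = (TN + TP) / prediction.count()
print('accuracy = {:.3f}'.format(accuracy))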
tfIdfIn = tokenized\ .where("array_contains(DescOut, 'red')")\ .select("DescOut")\ .limit(10) tfIdfIn.show(10, False) # COMMAND ---------- from pyspark.ml.feature import HashingTF, IDF tf = HashingTF()\ .setInputCol("DescOut")\ .setOutputCol("TFOut")\ .setNumFeatures(10000) idf = IDF()\ .setInputCol("TFOut")\ .setOutputCol("IDFOut")\ .setMinDocFreq(2) # COMMAND ---------- idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False) # COMMAND ---------- from pyspark.ml.feature import Word2Vec # Input data: Each row is a bag of words from a sentence or document. documentDF = spark.createDataFrame([ ("Hi I heard about Spark".split(" "), ), ("I wish Java could use case classes".split(" "), ),
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")]).toDF("label", "sentence")

# Split each sentence into words
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)
df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)
rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")
idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")
normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`. This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)
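# COMMAND ----------

# The notebook text above says the stages are then assembled into a Pipeline and fitted.
# A minimal sketch of that step; the input DataFrame name `df` and its "text" column are
# assumptions, since the loading cell is not shown here.
pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
pipelineModel = pipeline.fit(df)

# Assign each document to a cluster.
clustered = pipelineModel.transform(df)
clustered.select("text", "prediction").show(5)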
stopwords = StopWordsRemover(inputCol="tokens", outputCol="tokens_filtered") newsgroups = stopwords.transform(newsgroups) newsgroups = newsgroups.drop('tokens') count_vec = CountVectorizer(inputCol="tokens_filtered", outputCol="tf_features", vocabSize=num_features, minDF=2.0) count_vec_model = count_vec.fit(newsgroups) vocab = count_vec_model.vocabulary newsgroups = count_vec_model.transform(newsgroups) newsgroups = newsgroups.drop('tokens_filtered') #hashingTF = HashingTF(inputCol="tokens_filtered", outputCol="tf_features", numFeatures=num_features) #newsgroups = hashingTF.transform(newsgroups) #newsgroups = newsgroups.drop('tokens_filtered') idf = IDF(inputCol="tf_features", outputCol="features") newsgroups = idf.fit(newsgroups).transform(newsgroups) newsgroups = newsgroups.drop('tf_features') lda = LDA(k=num_topics, featuresCol="features", seed=0) model = lda.fit(newsgroups) topics = model.describeTopics() topics.show() model.topicsMatrix() topics_rdd = topics.rdd topics_words = topics_rdd\ .map(lambda row: row['termIndices'])\
yield end = time() print(f'Elapsed time: {end - start: .4f}s') algorithm = algorithms[args.algorithm] spark = SparkSession.builder.appName(args.app_name).getOrCreate() with timer(): print('[INFO] Reading time') rdd = spark.sparkContext.textFile(os.path.join('dataset', 'train.ft.txt')) rdd.cache() tokenizer = Tokenizer(inputCol = 'rawContent', outputCol = 'words') hashing_tf = HashingTF(numFeatures = args.num_features, inputCol = 'words', outputCol = 'rawFeatures') idf = IDF(inputCol = 'rawFeatures', outputCol = 'features') label_indexer = StringIndexer(inputCol = 'rawLabel', outputCol = 'label') with timer(): print('[INFO] Preprocessing time') df = rdd.map(lambda x: (x[:10], x[11: ])).toDF(['rawLabel', 'rawContent']) df = tokenizer.transform(df) tf_df = hashing_tf.transform(df) # tf_df.cache() idf_model = idf.fit(tf_df) encoded_df = idf_model.transform(tf_df) training_df = encoded_df.select('rawLabel', 'features') labelModel = label_indexer.fit(training_df) training_df = labelModel.transform(training_df) if algorithm in ['LogisticRegression', 'LinearSVC']:
just_text = news_types.map(lambda line: [line[0], line[4]]) fields = [StructField('id', StringType(), True),StructField('text', StringType(), True)] schema = StructType(fields) #We assume that there is an rdd called just_text with lists containing the tweet id and text in each row #just_text = [[id1, text1][id2, text2]....] data_df = sqlContext.createDataFrame(just_text, schema) tokenizer = Tokenizer(inputCol = "text", outputCol ="words") tokenizedData = tokenizer.transform(data_df) hashingTF = HashingTF(inputCol = "words", outputCol = "tf", numFeatures = 2**16) tfData = hashingTF.transform(tokenizedData) idf = IDF(inputCol = "tf", outputCol = "features") idfModel = idf.fit(tfData) finalData = idfModel.transform(tfData) model = LogisticRegressionModel.load('/user/maria_dev/user/maria_dev/sentimentModel') predictions = model.transform(finalData) predictions2 = predictions.select(predictions.id, predictions.text, predictions.prediction) #To create a regular rdd predictions_rdd = predictions2.rdd.map(list) predictions_without_text = predictions_rdd.map(lambda line: [line[0], line[2]])
#spark = pyspark.sql.SparkSession(sc)
df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("trainReviews.tsv")

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(df)
wordsData.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(wordsData)
tf.show(10)
tf.head().rawFeatures

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

ml = LogisticRegression(featuresCol="features", labelCol='category', regParam=0.01)
mlModel = ml.fit(tfidf.limit(5000))
res_train = mlModel.transform(tfidf)

extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())
res_train.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()

test_df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("testReviews.tsv")

tokenizer = Tokenizer(inputCol="text", outputCol="words")
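# To score testReviews.tsv consistently, the fitted `idf` model and `mlModel` from the
# training pass above should be reused rather than refitted. A sketch continuing from the
# re-created tokenizer, with the same column names as the training code; this is an
# illustration, not the original author's continuation.
test_words = tokenizer.transform(test_df)
test_tf = hashingTF.transform(test_words)
test_tfidf = idf.transform(test_tf)  # idf is already a fitted IDFModel
res_test = mlModel.transform(test_tfidf)

res_test.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()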
# Build the classification model

# Modeling parameters
numFeatures = 5000
minDocFreq = 50
numTrees = 1000

# Build the machine-learning pipeline
inx1 = StringIndexer(inputCol="hour", outputCol="hour-inx")
inx2 = StringIndexer(inputCol="month", outputCol="month-inx")
inx3 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx")
inx4 = StringIndexer(inputCol="sentiment", outputCol="label")
hashingTF = HashingTF(numFeatures=numFeatures, inputCol="words", outputCol="hash-tf")
idf = IDF(minDocFreq=minDocFreq, inputCol="hash-tf", outputCol="hash-tfidf")
va = VectorAssembler(inputCols=[
    "hour-inx", "month-inx", "dow-inx", "hash-tfidf", "pscore", "nscore"
], outputCol="features")
rf = RandomForestClassifier(numTrees=numTrees, maxDepth=4, maxBins=32,
                            labelCol="label", seed=42)
p = Pipeline(stages=[inx1, inx2, inx3, inx4, hashingTF, idf, va, rf])

# Split into training and test data
(trainSet, testSet) = hc.table("fm").randomSplit([0.7, 0.3])
trainData = trainSet.cache()
testData = testSet.cache()
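# With the pipeline and splits prepared above, fitting and a quick evaluation would look
# roughly like this. A sketch only: the F1 metric choice is an assumption, not part of the
# original code.
model = p.fit(trainData)
predictions = model.transform(testData)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1")
print("f1 = {:.3f}".format(evaluator.evaluate(predictions)))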
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1])) print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ["review", "label"]) dfTrain, dfTest = df.randomSplit([0.8, 0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english")) ) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf") idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed") dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) # **************************************************************** # *********************CROSS VALIDATION: 80%/20%****************** # *******************Model: DecisionTreeClassifier***************** # ***************************************************************** evaluator = MulticlassClassificationEvaluator( predictionCol="prediction", labelCol="target_indexed", metricName="precision" )
# remover.transform(sentence_df).show() # # # n-gram # from pyspark.ml.feature import NGram # # ngram = NGram(n=2, inputCol='tokens', outputCol='grams') # ngram.transform(sentence_df).show() # ngram.transform(sentence_df).select('grams').show(truncate=False) from pyspark.ml.feature import HashingTF, IDF, CountVectorizer hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures') featurized_data = hashing_tf.transform(rg_tokenized) # featurized_data.show() idf = IDF(inputCol='rawFeatures', outputCol='features') idf_model = idf.fit(featurized_data).transform(featurized_data) idf_model.select('id', 'features').show() # count vectorizer df = spark.createDataFrame([ (0, "a b c".split(" ")), (1, "a b b c a".split(" ")), ], ['id', 'words']) cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=3, minDF=2.0) model = cv.fit(df).transform(df)
|4  |Win a cash prize or a prize worth|1    |
+---+---------------------------------+-----+
only showing top 4 rows
"""
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
print(pipeline)

"""
Cross validating simple flight duration model

You've already built a few models for predicting flight duration and evaluated them with a simple train/test split.
However, cross-validation provides a much better way to evaluate model performance.

In this exercise you're going to train a simple model for flight duration using cross-validation.
Travel time is usually strongly correlated with distance,
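# Since the next exercise turns to cross-validation, here is a sketch of how the SMS pipeline
# above could be evaluated with CrossValidator. The parameter grid and evaluator choices are
# assumptions, and `sms` is the DataFrame from the earlier exercises.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# A tiny grid over the logistic-regression regularization strength.
params = ParamGridBuilder().addGrid(logistic.regParam, [0.01, 0.1, 1.0]).build()
evaluator = BinaryClassificationEvaluator()

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params,
                    evaluator=evaluator, numFolds=5, seed=13)
cv_model = cv.fit(sms)
print(cv_model.avgMetrics)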
text = text.split() return text # In[5]: clean_udf = udf(text_split, ArrayType(StringType())) df = df.withColumn("body", clean_udf("body")) # In[6]: #following section transforms the text using TFIDF start = time.clock() hashingTF = HashingTF(inputCol="body", outputCol="term_freq") df = hashingTF.transform(df) idf = IDF(inputCol="term_freq", outputCol="tfidf") idfModel = idf.fit(df) df = idfModel.transform(df) print("pyspark TFIDF processing time: {0:.5f} s".format(time.clock() - start)) # ## 4. Building a Naive Bayes Classifier # # The first step is to convert the topics (nominal) to a list of discrete integers # In[7]: #Using the OneHotEncoder to convert the topics into discrete integers stringIndexer = StringIndexer(inputCol="topic", outputCol="topicIndex") model = stringIndexer.fit(df) indexed = model.transform(df)
def main(sc, sqlContext):
    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user)

    tokens, category, categoryAndSubcategory = getTokensAndCategories()

    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                                     .map(lambda p: (p[0], [x for x in p[1] if x in tokens], p[2], p[3]))
                                     .cache())

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Building the corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1]) > 0))
                         .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                 .filter(tfIDF.type == u'Post')
                 #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                 .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading the model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF
                   .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                   .filter(lambda p: p[2] == 1)
                   .map(lambda p: (p[0], p[1]))
                   .groupByKey()
                   .mapValues(list)
                   .collect())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print '  Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                   .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                   .filter(lambda x: x[2] >= threshold)
                   .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame (bi-gram): normalized"
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table, since there is only room to display the indices
# of the non-zero elements, not their values.

# On to TF-IDF.
# By picking the right DataFrame among those above, these computations can of course be applied
# to any column (bigrams, with or without stop words...).
from pyspark.ml.feature import HashingTF

htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF

idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to matter much.
from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()

#**********************************************************************
#-----------Training the model for prediction--------------------------