## modeling
api_f = [
    'attributes.RestaurantsPriceRange2', 'business_id', 'stars',
    'review_count', 'categories'
]
cv = CountVectorizer(minDF=10, vocabSize=5000, inputCol='token', outputCol='vectors')
km1 = KMeans(k=20, featuresCol='vectors', maxIter=30)
pipe_count = Pipeline(stages=[cv, km1])

# IDF input must match the CountVectorizer output column ('vectors')
idf = IDF(inputCol='vectors', outputCol='features')
km2 = KMeans(k=20, featuresCol='features', maxIter=30)
pipe_idf = Pipeline(stages=[cv, idf, km2])

### fitting
#train_vect = data_tokenizer(dataset)
#model_cv_km = pipe_count.fit(train_vect)
#model_tf_km = pipe_idf.fit(train_vect)

def cluster_user_by_review(data_review, model):
    pred = model.transform(data_review)
    data = pred.select('user_id', 'prediction').withColumnRenamed('prediction', 'user_cl')
    data = data.dropDuplicates()
    return data
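# A minimal sketch of how the commented-out fitting block above could be used.
# It assumes data_tokenizer() returns a DataFrame with the 'token' and 'user_id'
# columns that the pipelines and the helper expect.
train_vect = data_tokenizer(dataset)
model_cv_km = pipe_count.fit(train_vect)   # KMeans on raw term counts
model_tf_km = pipe_idf.fit(train_vect)     # KMeans on TF-IDF features

users_by_count = cluster_user_by_review(train_vect, model_cv_km)
users_by_tfidf = cluster_user_by_review(train_vect, model_tf_km)
users_by_tfidf.show(5)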
data_df = spark.createDataFrame(typed_rdd, ["text", "label"])
data_set = data_df.select(data_df['label'], data_df['text'])

# splitting data into train and test
training_df, test_df = data_set.randomSplit([0.7, 0.3])
training_df.head(5)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")
# point the classifier at the TF-IDF column, not the raw term frequencies
nb = NaiveBayes(featuresCol="idf")

pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(),
                    numFolds=4)
cvModel = cv.fit(training_df)
result = cvModel.transform(test_df)
prediction_df = result.select("text", "label", "prediction")
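# A minimal follow-up sketch: score the held-out predictions with the same
# evaluator family used during cross-validation (the metric name is an assumption).
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: {:.4f}".format(evaluator.evaluate(result)))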
wordsData = tokenizer.transform(clean_jobs)

# remove stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredWords = remover.transform(wordsData)

# get ngrams
ngram = NGram(inputCol="filtered", outputCol="featureGrams")
gramData = ngram.transform(filteredWords)

# create TF-IDF of these
hashingTF = HashingTF(inputCol="featureGrams", outputCol="rawFeatures", numFeatures=350)
featurizedData = hashingTF.transform(gramData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.cache()
rescaledData.count()

# now filter all the criteria you care about:
ds_preds = rescaledData.where(
    (col("title").like("%Machine Learn%")) |
    (col("title").like("%Data Scientist%")) |
    (col("title").like("%Artificial Intel%")) |
    (col("title").like("%Analytic%")) |
    (col("title").like("%Statist%")) |
    (col("title").like("%ML%")) |
    (col("title").like("%AI%")) |
    (col("title").like("%Data Engin%")) |
    (col("title").like("%Programmer%")))

# analytics cluster
# 1. Tokenize the title; the regular expression ignores emoji and other non-word characters
title_tokenizer = RegexTokenizer(inputCol='title', outputCol='title_words',
                                 pattern='\\W', toLowercase=True)
# 2. Remove stopwords from title
title_sw_remover = StopWordsRemover(inputCol='title_words', outputCol='title_sw_removed')
# 3. Compute term frequency from title
title_count_vectorizer = CountVectorizer(inputCol='title_sw_removed', outputCol='tf_title')
# 4. Compute TF-IDF from title
title_tfidf = IDF(inputCol='tf_title', outputCol='tf_idf_title')
# 5. Tokenize the text; the regular expression ignores emoji and other non-word characters
text_tokenizer = RegexTokenizer(inputCol='text', outputCol='text_words',
                                pattern='\\W', toLowercase=True)
# 6. Remove stopwords from text
text_sw_remover = StopWordsRemover(inputCol='text_words', outputCol='text_sw_removed')
# 7. Compute term frequency from text
text_count_vectorizer = CountVectorizer(inputCol='text_sw_removed', outputCol='tf_text')
# list of stopwords to be removed from the posts
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
# use instance names that do not shadow the RegexTokenizer / CountVectorizer classes
regex_tokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                 outputCol="words",
                                 pattern="[^0-9a-z#+_]+")
stopword_remover = StopWordsRemover(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
count_vectorizer = CountVectorizer(inputCol=stopword_remover.getOutputCol(),
                                   outputCol="countFeatures",
                                   minDF=5)
idf = IDF(inputCol=count_vectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, rf, idx_2_string
])

# fitting the model
model = pipeline.fit(train)
                      inferSchema=True, sep='\t')
data.show()
data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
data.show()
data = data.withColumn('length', length(data['text']))
data.show()

tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
stop_remove = StopWordsRemover(inputCol='tokens', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='count_vec')
idf = IDF(inputCol='count_vec', outputCol='tf_idf')
label_index = StringIndexer(inputCol='class', outputCol='label')
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')
nb = NaiveBayes()

data_pipe = Pipeline(
    stages=[label_index, tokenizer, stop_remove, count_vec, idf, clean_up])
cleaned = data_pipe.fit(data).transform(data)
cleaned.show()
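# Sketch only: the NaiveBayes estimator defined above is never fit in this snippet.
# One plausible continuation splits the transformed data and trains it
# (NaiveBayes defaults to labelCol='label' and featuresCol='features').
train_data, test_data = cleaned.randomSplit([0.7, 0.3])
spam_detector = nb.fit(train_data)
test_results = spam_detector.transform(test_data)
test_results.select('label', 'prediction').show(5)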
# import the raw data from the dataset
data = spark.read.csv('data/training.1600000.processed.noemoticon.csv',
                      inferSchema=True)
data = data.select(['_c0', '_c5']).withColumnRenamed(
    '_c0', 'class').withColumnRenamed('_c5', 'text')

# build the preprocessing pipeline
# change label value from 0, 4 to 0, 1
stringIndexer = StringIndexer(inputCol='class', outputCol='label')
tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
stopwordsRemover = StopWordsRemover(inputCol='tokens', outputCol='tokens_filtered')
countVectorizer = CountVectorizer(inputCol='tokens_filtered', outputCol='count_vec')
# hashTF = HashingTF(numFeatures=2**16, inputCol="tokens_nonstop", outputCol='tf')
idf = IDF(inputCol='count_vec', outputCol='features', minDocFreq=5)  # minDocFreq: remove sparse terms
nb = NaiveBayes()
lr = LogisticRegression(maxIter=100)
customFilter = CustomFilter()

# pipeline = Pipeline(stages=[stringIndexer, tokenizer, stopwordsRemover, hashTF, idf, nb])
# pipeline = Pipeline(stages=[stringIndexer, tokenizer, stopwordsRemover, countVectorizer, idf, nb])
pipeline = build_ngrams(3)

data_train, data_val = data.randomSplit([0.8, 0.2])
model = pipeline.fit(data_train)
train_pred = model.transform(data_train)
val_pred = model.transform(data_val)
# val_acc = val_pred.filter(val_pred.label==val_pred.prediction).count() / float(data_val.count())
val_pred.show()
#########################################################################################
# Stop words and hashing
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
    .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
    .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
    .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)

#########################################################################################
# Training a spam classifier

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)
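# Sketch only: a quick look at how the classifier did, assuming sms carries a
# numeric 'label' column.
prediction.groupBy('label', 'prediction').count().show()

from pyspark.ml.evaluation import BinaryClassificationEvaluator
auc = BinaryClassificationEvaluator(labelCol='label').evaluate(prediction)
print('Test AUC: {:.3f}'.format(auc))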
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    headlines_df = spark.read.json(input_dir, encoding='utf-8',
                                   schema=df_schema).repartition(80)

    split_sentiment_df = headlines_df.withColumn(
        'polarity',
        functions.element_at(headlines_df['polarity_subjectivity'], 1)).withColumn(
            'subjectivity',
            functions.element_at(headlines_df['polarity_subjectivity'], 2))

    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    tokenizer = Tokenizer(inputCol='title_clean', outputCol='words')
    headline2Vector = Word2Vec(vectorSize=headline_vector_size, minCount=0,
                               inputCol='words', outputCol='headline_vector')
    hashingTF = HashingTF(inputCol='words', outputCol='word_counts',
                          numFeatures=word_freq_vector_size)
    idf = IDF(inputCol='word_counts', outputCol='word_frequency', minDocFreq=5)
    headline_vector_size_hint = VectorSizeHint(
        inputCol='headline_vector',
        size=headline_vector_size)  # need this for streaming
    word_freq_vector_size_hint = VectorSizeHint(
        inputCol='word_frequency',
        size=word_freq_vector_size)  # need this for streaming
    feature_assembler = VectorAssembler(inputCols=[
        'headline_vector', 'score', 'num_comments', 'subjectivity', 'word_frequency'
    ], outputCol='features')
    dt_classifier = DecisionTreeClassifier(featuresCol='features',
                                           labelCol='label',
                                           predictionCol='prediction',
                                           maxDepth=9)

    pipeline = Pipeline(stages=[
        tokenizer, headline2Vector, hashingTF, idf, headline_vector_size_hint,
        word_freq_vector_size_hint, feature_assembler, dt_classifier
    ])
    sentiment_model = pipeline.fit(training_set)

    validation_predictions = sentiment_model.transform(validation_set)
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    validation_score_accuracy = evaluator.evaluate(
        validation_predictions, {evaluator.metricName: "accuracy"})
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
[Row(id=1, sentence=u'This is an introduction to Spark MLlib')]

>>> sent_tokenized_df.take(1)
[Row(id=1, sentence=u'This is an introduction to Spark MLlib', words=[u'this', u'is', u'an', u'introduction', u'to', u'spark', u'mllib'])]

>>> hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
>>> sent_hfTF_df = hashingTF.transform(sent_tokenized_df)
>>> sent_hfTF_df.show(10, False)
# +---+-----------------------------------------------------------+-------------------------------------------------------------------+-------------------------------------------------------+
# |id |sentence                                                   |words                                                              |rawFeatures                                            |
# +---+-----------------------------------------------------------+-------------------------------------------------------------------+-------------------------------------------------------+
# |1  |This is an introduction to Spark MLlib                     |[this, is, an, introduction, to, spark, mllib]                     |(20,[1,5,6,8,12,13],[2.0,1.0,1.0,1.0,1.0,1.0])         |
# |2  |MLlib includes libraries for classification and regression |[mllib, includes, libraries, for, classification, and, regression] |(20,[1,6,9,12,13,15,16],[1.0,1.0,1.0,1.0,1.0,1.0,1.0]) |
# |3  |It also contains supporting tools for pipelines            |[it, also, contains, supporting, tools, for, pipelines]            |(20,[0,8,10,12,15,16],[1.0,1.0,1.0,1.0,1.0,2.0])       |
# +---+-----------------------------------------------------------+-------------------------------------------------------------------+-------------------------------------------------------+

>>> sent_hfTF_df.take(1)
[Row(id=1, sentence=u'This is an introduction to Spark MLlib', words=[u'this', u'is', u'an', u'introduction', u'to', u'spark', u'mllib'], rawFeatures=SparseVector(20, {1: 2.0, 5: 1.0, 6: 1.0, 8: 1.0, 12: 1.0, 13: 1.0}))]

>>> idf = IDF(inputCol='rawFeatures', outputCol='idf_features')
>>> idfModel = idf.fit(sent_hfTF_df)
>>> tfidf_df = idfModel.transform(sent_hfTF_df)
>>> tfidf_df.show(10, False)
# |id |sentence                                                   |words                                                              |rawFeatures                                            |idf_features                                                                                                                                          |
# |1  |This is an introduction to Spark MLlib                     |[this, is, an, introduction, to, spark, mllib]                     |(20,[1,5,6,8,12,13],[2.0,1.0,1.0,1.0,1.0,1.0])         |(20,[1,5,6,8,12,13],[0.5753641449035617,0.6931471805599453,0.28768207245178085,0.28768207245178085,0.0,0.28768207245178085])                         |
# |2  |MLlib includes libraries for classification and regression |[mllib, includes, libraries, for, classification, and, regression] |(20,[1,6,9,12,13,15,16],[1.0,1.0,1.0,1.0,1.0,1.0,1.0]) |(20,[1,6,9,12,13,15,16],[0.28768207245178085,0.28768207245178085,0.6931471805599453,0.0,0.28768207245178085,0.28768207245178085,0.28768207245178085])|
# |3  |It also contains supporting tools for pipelines            |[it, also, contains, supporting, tools, for, pipelines]            |(20,[0,8,10,12,15,16],[1.0,1.0,1.0,1.0,1.0,2.0])       |(20,[0,8,10,12,15,16],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.0,0.28768207245178085,0.5753641449035617])                        |

>>> tfidf_df.take(1)
# [Row(id=1, sentence=u'This is an introduction to Spark MLlib', words=[u'this', u'is', u'an', u'introduction', u'to', u'spark', u'mllib'], rawFeatures=SparseVector(20, {1: 2.0, 5: 1.0, 6: 1.0, 8: 1.0, 12: 1.0, 13: 1.0}), idf_features=SparseVector(20, {1: 0.5754, 5: 0.6931, 6: 0.2877, 8: 0.2877, 12: 0.0, 13: 0.2877}))]
def main(argv=None):
    if argv is None:
        inputs_train = sys.argv[1]
        inputs_test = sys.argv[2]

    conf = SparkConf().setAppName('sentiment-analysis-tfidf')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    # read train json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_train)
    train = text.select('overall', 'reviewText').withColumnRenamed('overall', 'label')
    train.cache()

    ## DATA PROCESSING PIPELINE
    # Split at whitespace and characters that are not letters
    tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words",
                               pattern="\\P{Alpha}+")
    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    pipeline_data_processing = Pipeline(stages=[tokenizer, remover])
    model_data_processing = pipeline_data_processing.fit(train)
    train_processed = model_data_processing.transform(train)
    train.unpersist()
    train_processed.cache()

    ## ML PIPELINE
    # TF-IDF Features
    hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures",
                          numFeatures=1000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    # linear Regression Model
    lr = LinearRegression(maxIter=20, regParam=0.1)
    # Final Pipeline
    pipeline = Pipeline(stages=[hashingTF, idf, lr])

    # FIT MODEL USING CROSS VALIDATION
    # Parameter grid for cross validation: numFeatures and regParam
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [5000, 10000, 20000, 50000]) \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1.0]) \
        .build()
    # 5-fold cross validation
    evaluator = RegressionEvaluator(metricName="rmse")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)
    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(train_processed)

    # RMSE on train data
    prediction_train = model.transform(train_processed)
    rmse_train = evaluator.evaluate(prediction_train)
    train_processed.unpersist()

    ## EVALUATION ON TEST DATA
    # read test json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_test)
    test = text.select('overall', 'reviewText').withColumnRenamed('overall', 'label')
    test_processed = model_data_processing.transform(test)

    # Evaluate the model on test data
    prediction_test = model.transform(test_processed)
    rmse_test = evaluator.evaluate(prediction_test)

    # Print Result
    result = "MODEL WITH TF_IDF - best no. features = " \
        + str(model.bestModel.stages[0].getNumFeatures()) + ":\n"
    result = result + "-Train RMSE: " + str(rmse_train) + "\n"
    result = result + "-Test RMSE: " + str(rmse_test) + "\n"
    print(result)
def lda_optimal(self, preprocess_file=DEFAULT_PREPROCESSING_OUTPUT,
                cluster_df=CLUSTER_DF, maxiter=MAXITER,
                output_file_name=DEFAULT_OUTPUT_FILE, max_term_tagging=m):
    filter_number_udf = udf(
        lambda row: [x for x in row if not self.is_digit(x)],
        ArrayType(StringType()))

    temp = sqlContext.read.parquet(preprocess_file)
    temp = temp.withColumn('no_number_vector_removed',
                           filter_number_udf(col('vector_no_stopw')))
    temp1 = temp.select(temp.paper_id, explode(temp.no_number_vector_removed))
    temp2 = temp1.filter(temp1.col != "")
    temp3 = temp2.groupby("paper_id").agg(
        F.collect_list("col").alias("vector_removed"))
    inner_join = temp3.join(temp, ["paper_id"])

    windowSpec = Window.orderBy(F.col("paper_id"))
    df_final = inner_join.withColumn("id", F.row_number().over(windowSpec))
    df_txts = df_final.select("vector_removed", "id", "paper_id", "doi",
                              "title", "authors", "abstract",
                              "abstract_summary", "vector_no_stopw")

    # read the cluster assignments from the path passed in as cluster_df
    # (not the literal string "CLUSTER_DF")
    df = sqlContext.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferschema", "true") \
        .option("mode", "DROPMALFORMED").load(cluster_df)
    # join on real column expressions instead of comparing two string literals;
    # assumes the CSV's "index" column keys the papers
    df_txts = df.join(df_txts, df["index"] == df_txts["paper_id"])

    # TF
    cv = CountVectorizer(inputCol="vector_removed", outputCol="raw_features",
                         vocabSize=5000, minDF=5.0)
    cvmodel = cv.fit(df_txts)
    result_cv = cvmodel.transform(df_txts)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv)

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, StringType
    spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
    schema = StructType([
        StructField('cluster_id', StringType(), True),
        StructField('tagging', ArrayType(StringType()), True)  # ArrayType needs an element type
    ])
    topic_modeling = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

    distinct_clusters = [
        row.cluster_id
        for row in result_tfidf.select("cluster_id").distinct()
                               .orderBy("cluster_id").collect()
    ]
    for i in distinct_clusters:
        subset = result_tfidf.filter(result_tfidf.cluster_id == i)
        lda = LDA(k=1, maxIter=100)
        ldaModel = lda.fit(subset)
        output = ldaModel.transform(subset)
        if i == 0:
            full_df = output
        else:
            full_df = full_df.union(output)
        topics = ldaModel.describeTopics(maxTermsPerTopic=max_term_tagging)
        vocabArray = cvmodel.vocabulary
        ListOfIndexToWords = udf(lambda wl: list([vocabArray[w] for w in wl]))
        FormatNumbers = udf(lambda nl: ["{:1.4f}".format(x) for x in nl])
        taggings = topics.select(
            ListOfIndexToWords(topics.termIndices).alias('words'))
        # collect the tag words so the new row contains plain Python values,
        # not a nested DataFrame
        temp = spark.createDataFrame(
            [(i, [r.words for r in taggings.collect()])],
            ['cluster_id', 'tagging'])
        topic_modeling = topic_modeling.union(temp)

    # output the taggings of each topic
    topic_modeling.toPandas().to_csv(output_file_name)
    return full_df
                          ' '))

# Text to tokens
tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized = tokenizer.transform(wrangled)

# Remove stop words.
remover = StopWordsRemover(inputCol="words", outputCol="terms")
removed = remover.transform(tokenized)

# Apply the hashing trick
hasher = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024)
hashed = hasher.transform(removed)

# Convert hashed symbols to TF-IDF
idf = IDF(inputCol="hash", outputCol="features")
sms = idf.fit(hashed).transform(hashed)

# View the first four records
sms.show(4, truncate=False)

# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2)
logistic = logistic.fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)
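# Sketch only: a quick confusion-matrix view of the test-set predictions,
# assuming a numeric 'label' column is present.
prediction.groupBy('label', 'prediction').count().show()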
################# Tokenize the data
pre_process = udf(
    lambda x: re.sub(r'[^A-Za-z\n ]|(http\S+)|(www.\S+)', '',
                     x.lower().strip()).split(),
    ArrayType(StringType())
)
df = df.withColumn("cleaned_data", pre_process(df.message)).dropna()

################# Split the dataframe into training and testing
train, test = df.randomSplit([0.8, 0.2], seed=100)

################# Create an ML Pipeline
# Performs TF-IDF calculation and Logistic Regression
remover = StopWordsRemover(inputCol="cleaned_data", outputCol="words")
vector_tf = CountVectorizer(inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=3)
label_indexer = StringIndexer(inputCol="sentiment", outputCol="label")
logistic_regression = LogisticRegression(maxIter=100)
pipeline = Pipeline(
    stages=[remover, vector_tf, idf, label_indexer, logistic_regression])

################# Fit the pipeline to the training dataframe
trained_model = pipeline.fit(train)
'''
The labels are encoded as
    positive (4) -> 0.0
    negative (0) -> 1.0
'''

################# Predicting the test dataframe and calculating accuracy
prediction_df = trained_model.transform(test)
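# Sketch only: the accuracy calculation mentioned above, using the multiclass evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy:", evaluator.evaluate(prediction_df))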
def main():
    spark = SparkSession.builder.appName('AmazonReviewsSparkProcessor').getOrCreate()

    # Convert command line args into a map of args
    args_iter = iter(sys.argv[1:])
    args = dict(zip(args_iter, args_iter))

    # Retrieve the args and replace 's3://' with 's3a://' (used by Spark)
    s3_input_data = args['s3_input_data'].replace('s3://', 's3a://')
    print(s3_input_data)
    s3_output_data = args['s3_output_data'].replace('s3://', 's3a://')
    print(s3_output_data)

    schema = StructType([
        StructField('is_positive_sentiment', IntegerType(), True),
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data, schema=schema, header=True, quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass over the data, applying IDF needs two passes:
    #   1) compute the IDF vector
    #   2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  #, minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('is_positive_sentiment', 'features').show()

    # TODO: Use SVD instead
    # features_vector_rdd = features_df.select('features').rdd.map(
    #     lambda row: Vectors.fromML(row.getAs[MLVector]('features')))
    # features_vector_rdd.cache()
    # mat = RowMatrix(features_vector_rdd)
    # k = 300
    # svd = mat.computeSVD(k, computeU=True)
    # TODO: Reconstruct

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('is_positive_sentiment', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df).select('is_positive_sentiment', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df
                            .withColumn('f', to_array(col('scaled_pca_features')))
                            .select(['is_positive_sentiment'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    # Remove overwrite to test for this issue
    # https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz
    expanded_features_df.write.csv(path=s3_output_data, header=None, quote=None)  #, mode='overwrite')

    print('Wrote to output file: {}'.format(s3_output_data))
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.sql.functions import udf, explode, size

sns.set()
spark = SparkSession.builder.appName("TU-1").getOrCreate()

train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')
p_train = pd.DataFrame({'data': train.data, 'target': train.target, 'filenames': train.filenames})
p_test = pd.DataFrame({'data': test.data, 'target': test.target, 'filenames': test.filenames})
s_train = spark.createDataFrame(p_train)
s_test = spark.createDataFrame(p_test)

tokenizer = RegexTokenizer(inputCol='data', outputCol='words', pattern='\\W')
termFreq = HashingTF(inputCol='words', outputCol='freq')
idf = IDF(inputCol='freq', outputCol='tfidf')
nb = NaiveBayes(featuresCol="tfidf", labelCol="target")
pipeline = Pipeline(stages=[tokenizer, termFreq, idf, nb])
model = pipeline.fit(s_train)

data = model.transform(s_test)
p_data = data.sample(False, 0.5).limit(500).toPandas()
mat = confusion_matrix(p_data.target, p_data.prediction)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=train.target_names, yticklabels=train.target_names)
.setOutputCol("filtered") from pyspark.ml.feature import CountVectorizer # we will remove words that appear in 5 docs or less cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\ .setInputCol("filtered")\ .setOutputCol("tf") # we now create a pipelined transformer cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(review) cv_pipeline.transform(review).show(5) from pyspark.ml.feature import IDF idf = IDF().\ setInputCol('tf').\ setOutputCol('tfidf') idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(review) tfidf_df = idf_pipeline.transform(review) tfidf_df.show(10) #training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1], seed=0) #training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1], seed=0) #[training_df.count(), validation_df.count(), testing_df.count()] import pandas as pd training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1],
smsDf = sqlContext.createDataFrame(smsXformed, ["label", "message"])
smsDf.cache()
smsDf.select("label", "message").show()

# Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

# Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF,
                            idf, nbClassifier])

nbModel = pipeline.fit(trainingData)
prediction = nbModel.transform(testData)
prediction.groupBy("label", "prediction").count().show()
tfIdfIn = tokenized\
    .where("array_contains(DescOut, 'red')")\
    .select("DescOut")\
    .limit(10)
tfIdfIn.show(10, False)

# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
    .setInputCol("DescOut")\
    .setOutputCol("TFOut")\
    .setNumFeatures(10000)
idf = IDF()\
    .setInputCol("TFOut")\
    .setOutputCol("IDFOut")\
    .setMinDocFreq(2)

# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)

# COMMAND ----------

from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame(
    [("Hi I heard about Spark".split(" "), ),
     ("I wish Java could use case classes".split(" "), ),
     ("Logistic regression models are neat".split(" "), )],
    ["text"])

# Learn a mapping from words to Vectors.
,StructField("id", StringType(), True)\ ,StructField("date", StringType(), True)\ ,StructField("flag", StringType(), True)\ ,StructField("user", StringType(), True)\ ,StructField("body", StringType(), True)]) df = spark.createDataFrame(data, schema=mySchema) df.show(5) # Create training, validation, and test sets (train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed=2000) # Prepare TF-IDF + Logistic Regression Model tokenizer = Tokenizer(inputCol="body", outputCol="words") hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf') idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms label_stringIdx = StringIndexer(inputCol="target", outputCol="label") pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx]) pipelineFit = pipeline.fit(train_set) train_df = pipelineFit.transform(train_set) val_df = pipelineFit.transform(val_set) train_df.show(5) # Train Model lr = LogisticRegression(maxIter=20) lrModel = lr.fit(train_df) predictions = lrModel.transform(val_df) # Evaluate Model from pyspark.ml.evaluation import BinaryClassificationEvaluator
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True)
plt.savefig('myfig_1.png')

print("\n")
print("=" * 40)
print("Running Logistic Regression using TF-IDF Features. Please wait.")
start = time.time()

# Maps a sequence of terms to their term frequencies using the hashing trick.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
# Compute the Inverse Document Frequency (IDF) given a collection of documents.
# minDocFreq: minimum number of documents in which a term should appear for filtering
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

pipeline = Pipeline(
    stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
lr = LogisticRegression(maxIter=20, regParam=0.1, elasticNetParam=0.2)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
end = time.time()

print("Accuracy:\t\t" + str(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})))
print("Weighted Precision:\t" + str(
    evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})))
print("countvectorize") df = CountVectorizer(inputCol="words", outputCol="countVector", vocabSize=2000, minDF=8.0)\ .fit(df)\ .transform(df)\ .select("countVector","overall") df.show(truncate=False) ### from pyspark.ml.feature import IDF print("tfidf") df = IDF(inputCol="countVector", outputCol="tfidf").fit(df).transform(df).select("tfidf", "overall") df.show() ### from pyspark.ml.feature import PCA print("pca") df = PCA(k=300, inputCol="tfidf", outputCol="pca").fit(df).transform(df).select("pca", "overall") df.show() #df.show(truncate=False) ### from pyspark.ml.regression import RandomForestRegressor
    return temp_dict

## Converting RDD to DataFrame
df = processed_papers.map(lambda record: Row(**row_conversion(record))).toDF()
#df.printSchema()

## Featurizing processed text into TF-IDF vectors
cv = sparkCountVectorizer(inputCol='body_text', outputCol='tf_vector')
cv_model = cv.fit(df)
tf_df = cv_model.transform(df)  ## New column tf_vector with respective term-frequency vectors

## Standardizing TF vectors into TF-IDF vectors
idf = IDF(inputCol='tf_vector', outputCol='tfidf_vector')
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(tf_df)  ## New column tfidf_vector with respective TF-IDF vectors

## Helper function to convert sparse vector to dense vector
def sparse_to_dense(v):
    v = DenseVector(v)
    dense_vector = list([float(x) for x in v])
    return dense_vector

## Converting back to RDD
papers_rdd = tfidf_df.select('paper_id', 'tfidf_vector').rdd.map(
    lambda t: (t['paper_id'], sparse_to_dense(t['tfidf_vector'])))
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    NUM_FEATURES = 2**8

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
                        SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
                        SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=NUM_FEATURES)
    featurizedData = hashingTF.transform(removed)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.registerTempTable("resultTable")
    jobs = spark.sql("SELECT features, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT features AS featuresCV, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT features AS featuresCAT, cat.id, cat.skillName AS skillName, category FROM resultTable AS rt \
                            LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    # Calculate job-cv similarity START
    crossJoined = jobs.select("jobId", "features").crossJoin(cvs.select("cvid", "featuresCV"))
    calculatedDF = crossJoined.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.features, x.featuresCV)))\
        .toDF(["jobid", "cvid", "distance"])
    ordered = calculatedDF.orderBy(asc("jobid")).coalesce(2)
    ordered.write.csv('Calculated/tfidf/job-cv')
    # Calculate job-cv similarity END

    # Calculate cv-category similarity START
    crossJoined_cat_cv = cvs.select("cvid", "featuresCV").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_cat_cv = crossJoined_cat_cv.rdd\
        .map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.featuresCV, x.featuresCAT)))\
        .toDF(["cvid", "catid", "skillName", "category", "distance"])
    ordered_cat_cv = calculatedDF_cat_cv.orderBy(asc("cvid"), asc("distance")).coalesce(2)
    ordered_cat_cv.write.csv('Calculated/tfidf/cv-category')
    # Calculate cv-category similarity END

    # Job-category START
    crossJoined_job_cat = jobs.select("jobId", "features").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd\
        .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.features, x.featuresCAT)))\
        .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy(asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/tfidf/job-category')
swr = StopWordsRemover(inputCol='text_token', outputCol='text_sw_removed')
reviews_swr = swr.transform(reviews_token)
reviews_swr.show(3)

# In[9]:

# Word Term Frequency
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="text_sw_removed", outputCol="tf")
cv_model = cv.fit(reviews_swr)
reviews_cv = cv_model.transform(reviews_swr)
reviews_cv.show(3)

# In[10]:

# TF-IDF
from pyspark.ml.feature import IDF

idf = IDF(inputCol="tf", outputCol="features")
idf_model = idf.fit(reviews_cv)
reviews_tfidf = idf_model.transform(reviews_cv)
reviews_tfidf.show(3)

# In[11]:

# Predict Rating Score (Repeat What we did in Lecture 10)
gradings = reviews_tfidf.select('funny', 'cool', 'useful', 'stars').toPandas()
sns.distplot(gradings['funny'])
sns.distplot(gradings['cool'])
sns.distplot(gradings['useful'])
sns.distplot(gradings['stars'])

from pyspark.ml.feature import StringIndexer

stringIdx = StringIndexer(inputCol="stars", outputCol="label")
final = stringIdx.fit(reviews_tfidf).transform(reviews_tfidf)
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")],
    ["label", "sentence"])
sentenceData.show()

tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
words_data = tokenizer.transform(sentenceData)
words_data.show(truncate=False)

hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(words_data)

idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)
rescaled_data.select('label', 'features').show(truncate=False)

from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([(0, "a b c".split(" ")),
                            (1, "a b b c a".split(" "))],
                           ["id", "words"])
df.show()

cv = CountVectorizer(inputCol='words', outputCol='features',
                     vocabSize=3, minDF=2.0)
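# Sketch only: fit the CountVectorizer defined above and inspect the result.
cv_model = cv.fit(df)
cv_model.transform(df).show(truncate=False)
print(cv_model.vocabulary)  # at most 3 terms are kept because vocabSize=3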
pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover( inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) if algo == "gbm": ## Create GBM model algoStage = H2OGBM(ratio=0.8, seed=1, featuresCols=[idf.getOutputCol()], predictionCol="label") elif algo == "dl": ## Create H2ODeepLearning model algoStage = H2ODeepLearning(epochs=10, seed=1, l1=0.001, l2=0.0, hidden=[200, 200], featuresCols=[idf.getOutputCol()],
# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4
# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create a cross validator
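# Sketch only: a plausible completion of the cross validator from the objects
# defined above (numFolds is an assumption).
cv = CrossValidator(estimator=regression,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5)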
def main(sc, sqlContext):
    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user)
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts)
                .map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                .map(lambda p: (p[0], [x for x in p[1] if x in tokens], p[2], p[3]))
                .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Creating corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD
                 .map(lambda s: (s[0],
                                 [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds],
                                 s[2], s[3]))
                 .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1]) > 0))
                 .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(
        lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(
        sqlContext.read.parquet(
            "/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",
                          numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (
        tfIDF.filter(tfIDF.type == u'Post')
        #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
        .cache())
    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(
        sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF
                   .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                   .filter(lambda p: p[2] == 1)
                   .map(lambda p: (p[0], p[1]))
                   .groupByKey().mapValues(list).collect())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []
    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(
                postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf.map(lambda x: (post, x.label, cossine(x.features, postVector)))
                   .filter(lambda x: x[2] >= threshold)
                   .collect())
            if len(sim) > 0:
                suggestions.append(sim)
    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ['review', 'label']) dfTrain, dfTest = df.randomSplit([0.8, 0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words('english'))) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxIter=30, regParam=0.01) pipeline = Pipeline( stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision') # grid=(ParamGridBuilder() # .baseOn([evaluator.metricName,'precision'])