plt.show()

y1Val.sort()
y2Val.sort()
plt.plot(x, y1Val, 'go--', linewidth=2, markersize=0, label='Non-Spoiler')
plt.plot(x, y2Val, 'ro-', linewidth=2, markersize=0, label='Spoiler')
plt.ylabel('Average Sentence Length')
plt.legend(loc='upper left')
plt.title('Review Sentence Length')
plt.savefig(IMG_PATH + 'lengthSorted.png', format='png', transparent=True)
plt.show()

# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol='stoppedWords').setStopWords(stopWords)
pipeline = Pipeline(stages=[tokenizer, stopWordRemover])
dataSet = pipeline.fit(dataSet).transform(dataSet)

# %%
newLengthDF = dataSet.withColumn('newLength',
                                 F.size(stopWordRemover.getOutputCol()))

# %%
newSentenceLen = newLengthDF.select('class', 'newLength').collect()

# %%
y = [
def preprocess_files(bucket_name, file_name):
    raw_data = sql_context.read.parquet("s3a://{0}/{1}".format(
        bucket_name, file_name))

    # LIMIT TO 10 initially
    unanswered_questions = raw_data.filter(raw_data.PostTypeId == 1).filter(
        raw_data.AcceptedAnswerId.isNull())
    print(unanswered_questions.count())

    # Clean article text
    print(colored("[PROCESSING]: Cleaning post body", "green"))
    clean_body = F.udf(lambda body: filter_body(body), StringType())
    clean_article_data = unanswered_questions.withColumn(
        "cleaned_body", clean_body("Body"))

    # Tokenize article text
    print(colored("[PROCESSING]: Tokenizing text vector...", "green"))
    tokenizer = Tokenizer(inputCol="cleaned_body",
                          outputCol="text_body_tokenized")
    tokenized_data = tokenizer.transform(clean_article_data)
    print("tokenized_data")
    # tokenized_data.show()

    # Remove stop words
    print(colored("[PROCESSING]: Removing stop words", "green"))
    stop_words_remover = StopWordsRemover(
        inputCol="text_body_tokenized",
        outputCol="text_body_stop_words_removed")
    stop_words_removed_data = stop_words_remover.transform(tokenized_data)
    print("stop_words_removed_data")
    # stop_words_removed_data.show()

    # Stem words
    print(colored("Stemming tokenized text", "green"))
    stem = F.udf(lambda tokens: lemmatize(tokens), ArrayType(StringType()))
    stemmed_data = stop_words_removed_data.withColumn(
        "text_body_stemmed", stem("text_body_stop_words_removed"))
    print("stemmed_data")
    # stemmed_data.show()

    # Shingle resulting body
    print(colored("Shingling resulting text", "green"))
    shingle = F.udf(lambda tokens: get_n_gram_shingles(tokens, 3), StringType())
    shingled_data = stemmed_data.withColumn("text_body_shingled",
                                            shingle("text_body_stemmed"))
    shingle_table = shingled_data.select('Id', 'text_body_shingled')

    print(colored("Adding category/id mappings to Redis", "green"))
    print("shingle_table")
    shingle_table.head()

    # Create a mapping of article categories to article id's that fall under that category.
    # Each key is an article category and the values the list of article id's.
    cat_id_map = unanswered_questions.select(
        F.explode('Tags').alias('Tag'), 'Id').groupBy(F.col('Tag')).agg(
            F.collect_list('Id').alias('Ids_list')).where(
                F.size(F.col('Ids_list')) < 200).withColumn(
                    'Ids', to_str_udf('Ids_list'))

    print(colored("Beginning writing category/id mapping to Redis", "green"))

    def write_cat_id_map_to_redis(rdd):
        rdb = redis.StrictRedis(
            host="ec2-52-73-233-196.compute-1.amazonaws.com", port=6379, db=1)
        for row in rdd:
            rdb.sadd('cat:{}'.format(row.Tag), row.Ids)

    cat_id_map.foreachPartition(write_cat_id_map_to_redis)
    print(cat_id_map.show(5, True))
    print(colored("Finished writing category/id mapping to Redis", "green"))

    # Minhash calculations
    k = 100
    random_seed = 50
    masks = (np.random.RandomState(seed=random_seed).randint(
        np.iinfo(np.int64).min, np.iinfo(np.int64).max, k))

    def update_min_hash_signature(word, min_hash_signature):
        root_hash = mmh3.hash64(pickle.dumps(word))[0]
        # XOR root hash with k randomly generated integers to simulate k hash functions
        word_hashes = np.bitwise_xor(masks, root_hash)
        min_hash_signature = np.minimum(min_hash_signature, word_hashes)
        return min_hash_signature

    def calc_min_hash_signature(tokens):
        min_hash_signature = np.empty(k, dtype=np.int64)
        min_hash_signature.fill(np.iinfo(np.int64).max)
        for token in tokens:
            min_hash_signature = update_min_hash_signature(
                token, min_hash_signature)
        return min_hash_signature

    def compute_minhash(df):
        calc_min_hash_udf = F.udf(
            lambda x: str(
                list(map(lambda x: int(x), calc_min_hash_signature(x)))),
            StringType())
        df = df.withColumn("min_hash",
                           calc_min_hash_udf("text_body_shingled")).select(
                               'id', 'min_hash')
        return df

    print(colored("Computing minhash values", "green"))
    minhash_df = compute_minhash(shingle_table)
    print(colored("Finished computing minhash values", "green"))

    print(colored("Beginning writing minhash data to Redis", "green"))

    # Write minhash data to Redis. If pipeline=True, use the pipeline
    # method of inserting data in Redis.
    def write_minhash_data_to_redis(rdd):
        rdb = redis.StrictRedis(
            host="ec2-52-73-233-196.compute-1.amazonaws.com", port=6379, db=1)
        for row in rdd:
            rdb.sadd('id:{}'.format(row.id), row.min_hash)

    # print(minhash_df.show(5, True))
    minhash_df.foreachPartition(write_minhash_data_to_redis)
    print(colored("Finished writing minhash data to Redis", "green"))
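# A short standalone sketch (not part of the pipeline above) illustrating why
# MinHash works: the fraction of positions where two signatures agree is an
# estimate of the Jaccard similarity of the underlying shingle sets.
import numpy as np

def estimate_jaccard(sig_a, sig_b):
    sig_a = np.asarray(sig_a, dtype=np.int64)
    sig_b = np.asarray(sig_b, dtype=np.int64)
    return float(np.mean(sig_a == sig_b))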
old_df = new_df
p1.close()

for f2 in listdir(hampath):
    p2 = open(hampath + f2, 'r')
    # create a temporary dataframe and append the label '0.0' to every ham email
    temp2 = spark.createDataFrame([(p2.read(), 0.0)], ['text', 'label'])
    new_df = old_df.unionAll(temp2)
    old_df = new_df
    p2.close()

final = old_df
(training, test) = final.randomSplit([0.8, 0.2])  # split the dataframe into training and test

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")  # tokenizer for splitting each email into words
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="features")  # create a feature for every word
lr = LogisticRegression(maxIter=10, regParam=0.001)  # make a logistic regression model
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)  # fit the model based on training data

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)  # make predictions on test data based on the model
prediction.show(1)  # show the columns of interest

# Accuracy calculation
evaluator1 = MulticlassClassificationEvaluator(
mySchema = StructType([
    StructField("target", IntegerType(), True),
    StructField("id", StringType(), True),
    StructField("date", StringType(), True),
    StructField("flag", StringType(), True),
    StructField("user", StringType(), True),
    StructField("body", StringType(), True)])

df = spark.createDataFrame(data, schema=mySchema)
df.show(5)

# Create training, validation, and test sets
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed=2000)

# Prepare TF-IDF + Logistic Regression Model
tokenizer = Tokenizer(inputCol="body", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

# Train Model
lr = LogisticRegression(maxIter=20)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
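# A minimal evaluation sketch (not part of the original snippet): score the
# validation predictions by area under ROC, assuming the default "rawPrediction"
# and "label" columns produced by the pipeline and model above.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
print("Validation AUC: {:.4f}".format(evaluator.evaluate(predictions)))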
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('nlp').getOrCreate()

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")],
    ["label", "sentence"])
sentenceData.show()

tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
words_data = tokenizer.transform(sentenceData)
words_data.show(truncate=False)

hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(words_data)

idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)
rescaled_data.select('label', 'features').show(truncate=False)

from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([(0, "a b c".split(" ")),
                            (1, "a b b c a".split(" "))],
                           ["id", "words"])
df.show()
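# A brief continuation sketch (not in the original snippet): fit a CountVectorizer
# on the pre-tokenized "words" column and inspect the resulting term-count vectors.
cv = CountVectorizer(inputCol="words", outputCol="features",
                     vocabSize=3, minDF=2.0)
cv_model = cv.fit(df)
cv_model.transform(df).show(truncate=False)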
def tokenizer(dataset, inputCol):
    from pyspark.ml.feature import Tokenizer
    return Tokenizer(inputCol=inputCol,
                     outputCol=inputCol + '_tkn').transform(dataset)
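# Usage sketch for the helper above (not from the original source): build a tiny
# DataFrame and tokenize its "review" column, producing a "review_tkn" array column.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('tokenizer-demo').getOrCreate()
demo_df = spark.createDataFrame([(1, 'spark makes tokenizing easy')],
                                ['id', 'review'])
tokenizer(demo_df, 'review').show(truncate=False)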
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

df = spark.read.csv('file:////home/ubuntu/ys-180326/Dataset150.csv', header=True)
data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
split_neg_data2 = score.zip(comment)
transform_data = split_neg_data2.map(lambda p: (p[0], p[1]))  # .toDF()#.withColumnRenamed('_1','label')
# transform_data.show()

# sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),(0, "I wish Java could use case classes"),(1, "Logistic regression models are neat")]).toDF("label", "sentence")
sentenceData = spark.createDataFrame(transform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
start_time = time.time()
spark = SparkSession.builder.appName("TfIdf Example").getOrCreate() documentData = spark.createDataFrame([ (0.0, Doc1), (0.1, Doc2), (0.2, Doc3), (0.3, Doc4), (0.5, Doc5) ], ["label", "document"]) #Printing the data documentData.show() #Performing Tokenization for the data tokenizer = Tokenizer(inputCol="document", outputCol="words") wordsData = tokenizer.transform(documentData) wordsData.show() """# **2.a) Performing the task without NLP**""" # applying tf on the words data hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200) tf = hashingTF.transform(wordsData) # alternatively, CountVectorizer can also be used to get term frequency vectors # calculating the IDF tf.cache() idf = IDF(inputCol="rawFeatures", outputCol="features") idf = idf.fit(tf) tfidf = idf.transform(tf) #displaying the results
    F.to_date('PublishDate', "yyyy-MM-dd HH:mm:ss"))

df_timestamped = df_news.select(['PublishDate1', 'Topic', 'Title', 'Headline'])

# drop duplicates
# df_timestamped = df_timestamped.dropDuplicates(['Title', 'Headline'])

# remove punctuation from text data, text to lower case and trim whitespaces
df_timestamped = df_timestamped.withColumn(
    'Title',
    F.trim(F.lower(F.regexp_replace(F.col('Title'), '[^\sa-zA-Z0-9]', ''))))
df_timestamped = df_timestamped.withColumn(
    'Headline',
    F.trim(F.lower(F.regexp_replace(F.col('Headline'), '[^\sa-zA-Z0-9]', ''))))

# tokenize titles and headlines
title_tokenizer = Tokenizer(inputCol='Title', outputCol='Title_words')
headline_tokenizer = Tokenizer(inputCol='Headline', outputCol='Headline_words')
df_timestamped = title_tokenizer.transform(df_timestamped)
df_timestamped = headline_tokenizer.transform(df_timestamped)

# remove stop words
title_remover = StopWordsRemover(inputCol='Title_words', outputCol='Title_final')
headline_remover = StopWordsRemover(inputCol='Headline_words', outputCol='Headline_final')
df_timestamped = title_remover.transform(df_timestamped)
df_timestamped = headline_remover.transform(df_timestamped)

# simplify dataframe
df_timestamped = df_timestamped.select(
    F.col('PublishDate1').alias('PublishDate'),
    F.col('Topic'),
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    headlines_df = spark.read.json(input_dir, encoding='utf-8',
                                   schema=df_schema).repartition(80)

    split_sentiment_df = headlines_df.withColumn(
        'polarity',
        functions.element_at(headlines_df['polarity_subjectivity'], 1)).withColumn(
            'subjectivity',
            functions.element_at(headlines_df['polarity_subjectivity'], 2))

    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    tokenizer = Tokenizer(inputCol='title_clean', outputCol='words')
    headline2Vector = Word2Vec(vectorSize=headline_vector_size, minCount=0,
                               inputCol='words', outputCol='headline_vector')
    hashingTF = HashingTF(inputCol='words', outputCol='word_counts',
                          numFeatures=word_freq_vector_size)
    idf = IDF(inputCol='word_counts', outputCol='word_frequency', minDocFreq=5)
    headline_vector_size_hint = VectorSizeHint(
        inputCol='headline_vector',
        size=headline_vector_size)  # need this for streaming
    word_freq_vector_size_hint = VectorSizeHint(
        inputCol='word_frequency',
        size=word_freq_vector_size)  # need this for streaming
    feature_assembler = VectorAssembler(inputCols=[
        'headline_vector', 'score', 'num_comments', 'subjectivity',
        'word_frequency'
    ], outputCol='features')
    dt_classifier = DecisionTreeClassifier(featuresCol='features',
                                           labelCol='label',
                                           predictionCol='prediction',
                                           maxDepth=9)

    pipeline = Pipeline(stages=[
        tokenizer, headline2Vector, hashingTF, idf, headline_vector_size_hint,
        word_freq_vector_size_hint, feature_assembler, dt_classifier
    ])
    sentiment_model = pipeline.fit(training_set)

    validation_predictions = sentiment_model.transform(validation_set)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    validation_score_accuracy = evaluator.evaluate(
        validation_predictions, {evaluator.metricName: "accuracy"})
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
def base_features_gen_pipeline(input_descript_col="descript",
                               input_category_col="category",
                               output_feature_col="features",
                               output_label_col="label"):
    tok = Tokenizer(inputCol=input_descript_col, outputCol="items")
    cv = CountVectorizer(inputCol="items", outputCol=output_feature_col)
    i = StringIndexer(inputCol=input_category_col, outputCol=output_label_col)
    return Pipeline(stages=[tok, cv, i])
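# Usage sketch (not from the original source): fit the generated pipeline on a
# tiny hypothetical DataFrame with "descript" and "category" columns, assuming
# an active SparkSession named `spark`, then inspect the features and label.
demo_df = spark.createDataFrame(
    [("the plot was thrilling", "fiction"),
     ("a clear guide to spark", "technical")],
    ["descript", "category"])
feature_pipeline = base_features_gen_pipeline()
featurized_demo = feature_pipeline.fit(demo_df).transform(demo_df)
featurized_demo.select("features", "label").show(truncate=False)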
    'hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv'
)
df0.printSchema()

# FILTERING FOR EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull())
                  & (col("verified_purchase").isNotNull()))

# ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

# CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# NAIVEBAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

# Model training
model = nb.fit(rescaledData)
timestart = datetime.datetime.now()

print("abstracts_full_df2.head() = {}".format(abstracts_full_df2.head()))

# Convert the content to Lower Case
print("Converting the abstract to Lower Case ... ")
abstracts_full_df3 = abstracts_full_df2.withColumn("abstractNew", lower(col("abstract"))).\
    withColumn("abstractNew", regexp_replace("abstractNew", '[^\w-_ ]', ""))
abstracts_full_df3.printSchema()
# print("abstracts_full_df3.head() = {}".format(abstracts_full_df3.head()))

# Tokenize the Abstracts
print("Tokenizing the abstracts... ")
tokenizer = Tokenizer(inputCol="abstractNew", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtWords")
abstracts_full_df4 = tokenizer.transform(abstracts_full_df3)
print("After tokenization: ")
abstracts_full_df4.printSchema()
print("abstracts_full_df4.count() = {}".format(abstracts_full_df4.count()))
# print("abstracts_full_df4.head() = {}".format(abstracts_full_df4.head()))

# PRINT HOW MUCH TIME IT TOOK TO RUN THE CELL
timeend = datetime.datetime.now()
timedelta = round((timeend - timestart).total_seconds() / 60, 2)
print("Time taken to execute above cell: " + str(timedelta) + " mins")

# ### Step 5
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

#########################################################################################
# Stop words and hashing
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
    .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
# Import library
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

# Create a spark session
spark = SparkSession.builder.appName("TfIdf-tokenizing").getOrCreate()

# Load and read input
documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id",
                                 F.row_number().over(Window.orderBy('value')))
print('The schema associated to the input is')
documents.printSchema()

# Tokenization of input (splitting the text into individual tokens/words)
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)

# Computation of Term-Frequency (TF) associated with the data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

# Computation of Inverse Document Frequency (IDF) associated with the data
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Identifying the TF-IDF associated with the data
rescaledData.select("doc_id", "features").show(truncate=False)

# Close spark session
spark.stop()
def divideAmount(totalAmount, totalSupporter):
    if totalSupporter is None or totalAmount is None:
        return 0
    if totalSupporter == 0 or totalAmount == 0:
        return 0
    price = int(totalAmount / totalSupporter)
    return price


price_udf = udf(dividePrice, IntegerType())
amount_udf = udf(divideAmount, IntegerType())

spark_df = spark_df.withColumn('rangeAmount',
                               price_udf(spark_df['totalAmount'],
                                         spark_df['totalSupporter']))
spark_df = spark_df.withColumn('amount',
                               amount_udf(spark_df['totalAmount'],
                                          spark_df['totalSupporter']))
spark_df = spark_df.withColumn('soop', spark_df['soop'].cast('string'))
spark_df.show()

tokenizer = Tokenizer(inputCol='soop', outputCol='keywords')
wordData = tokenizer.transform(spark_df)

word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol='keywords',
                    outputCol='word_vec', seed=123)
word2VecData = word2Vec.fit(wordData)
word2VecData = word2VecData.transform(wordData)

# add the rangeAmount comparison value
all_wadiz_vecs = word2VecData.select('id', 'word_vec', 'rangeAmount')

import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.feature import Normalizer
from pyspark.sql.column import Column, _to_java_column, _to_seq


def cosinesimilarity_udf(a, b):
outputCol="desc_vec") w2v_model = word2Vec.fit(w2v_df) train_df = w2v_model.transform(w2v_df) train_df.display() # COMMAND ---------- # MAGIC %md ###Step 2: Feature Transformation # MAGIC Use Tokenizer to tokenize text into individual terms. # COMMAND ---------- from pyspark.ml.feature import Tokenizer tokenizer = Tokenizer(inputCol="description", outputCol="desc_terms") tokenized_df = tokenizer.transform(preproc_data) tokenized_df.select("description", "desc_terms").display() # COMMAND ---------- # MAGIC %md **StopWordsRemover** for removing very commonly occuring words which do not carry much meaning. # COMMAND ---------- from pyspark.ml.feature import StopWordsRemover stops_remover = StopWordsRemover(inputCol="desc_terms", outputCol="desc_nostops") stops_df = stops_remover.transform(tokenized_df)
from operator import add
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, Word2Vec

spark = SparkSession.builder.appName("DataFrame").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

df = spark.read.json(
    '/home/khalid/exercise_7_for_first_term_students/tweets.json')

tokenizer = Tokenizer(inputCol="content", outputCol="words")
wordsData = tokenizer.transform(df)
new_words = wordsData.select('words')

# Taken from previous exercise
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words",
                    outputCol="result")
model = word2Vec.fit(new_words)
result = model.transform(new_words)
result.select('result').show()  # Show top rows
# training set
text_positive = sc.textFile("data/training_positif_clean.csv")
text_negative = sc.textFile("data/training_negatif_clean.csv")

pos_labels = text_positive.map(lambda x: 1.0).zip(
    text_positive.map(lambda x: x))
neg_labels = text_negative.map(lambda x: 0.0).zip(
    text_negative.map(lambda x: x))

pos_df = pos_labels.toDF(["label", "sentence"])
neg_df = neg_labels.toDF(["label", "sentence"])
text_df = pos_df.union(neg_df)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(text_df)

# number of words
nb_features = 10000

print("\nDone : Tokenization training set")

###########################################################################
#########                 TF IDF Training Set                     #########

# training set
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",
                      numFeatures=nb_features)
featurizedData = hashingTF.transform(wordsData)
smsDf = sqlContext.createDataFrame(smsXformed, ["label", "message"])
smsDf.cache()
smsDf.select("label", "message").show()

# Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

# Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nbClassifier])

nbModel = pipeline.fit(trainingData)
prediction = nbModel.transform(testData)

prediction.groupBy("label", "prediction").count().show()
def countIntersection(avec, bvec):
    count = 0
    for x in avec:
        if x in bvec:
            count += 1
    return count


countIntersection_udf = udf(lambda x, y: countIntersection(x, y), IntegerType())

combined_list = nameList.select("pid", concat("name").alias("text"))
cleaned_list = combined_list.select("pid", clean_text(col("text")).alias("text"))

tokenizer = Tokenizer(inputCol="text", outputCol="vector")
vec_text = tokenizer.transform(cleaned_list).select("pid", "vector")
stemmed_text = vec_text.withColumn("text_stemmed", stemmer_udf("vector")).select(
    "pid", "text_stemmed")

challengeDF = spark.read.json(hdfs_challengeDir, multiLine=True)
challengePlaylist = challengeDF.select(explode("playlists").alias("playlists"))
challengeNameList = challengePlaylist.select("playlists.pid", "playlists.name")
challenge_combined_list = challengeNameList.select(
    "pid", concat("name").alias("text"))
challenge_cleaned_list = challenge_combined_list.select(
    "pid", clean_text(col("text")).alias("text")).filter("text is not null")
    else:
        return False


############################## SPARK ML PIPELINE + CALCULATING ACCURACY ################################

after_process = sqlContext.read.parquet(
    "hdfs:///user/prado/little_text_process1.parquet")
mlinterest = after_process.na.drop(subset=["sentiment"])

sent_value = udf(sentiment_values, IntegerType())
MLinterest = mlinterest.withColumn('label', sent_value(mlinterest.sentiment))
MLINTEREST = MLinterest.filter(MLinterest.lang == "en")
MLINTEREST1 = MLINTEREST.withColumn("label", MLINTEREST.label.cast(DoubleType()))

# Create pipeline
tokenizer = Tokenizer(inputCol="main", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
nb = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

# Separate train/test
train, test = MLINTEREST1.randomSplit([0.6, 0.4], 24)
train.cache()

# Train our model
model = pipeline.fit(train)
predictionAndLabels = model.transform(
    test.withColumnRenamed('label', 'true_label'))
wesh = predictionAndLabels.select('prediction', 'true_label').rdd
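# A minimal sketch (assumed continuation, not the author's code): compute accuracy
# from the (prediction, true_label) pairs using MLlib's MulticlassMetrics.
from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(
    wesh.map(lambda row: (float(row.prediction), float(row.true_label))))
print("Accuracy = {:.4f}".format(metrics.accuracy))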
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    NUM_FEATURES = 2**8

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
        SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
        SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=NUM_FEATURES)
    featurizedData = hashingTF.transform(removed)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.registerTempTable("resultTable")
    jobs = spark.sql("SELECT features, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT features AS featuresCV, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT features AS featuresCAT, cat.id, cat.skillName AS skillName, category FROM resultTable AS rt\
        LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    # Calculate job-cv similarity START
    crossJoined = jobs.select("jobId", "features").crossJoin(cvs.select("cvid", "featuresCV"))
    calculatedDF = crossJoined.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.features, x.featuresCV)))\
        .toDF(["jobid", "cvid", "distance"])
    ordered = calculatedDF.orderBy(asc("jobid")).coalesce(2)
    ordered.write.csv('Calculated/tfidf/job-cv')
    # Calculate job-cv similarity END

    # Calculate cv-category similarity START
    crossJoined_cat_cv = cvs.select("cvid", "featuresCV").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_cat_cv = crossJoined_cat_cv.rdd\
        .map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.featuresCV, x.featuresCAT)))\
        .toDF(["cvid", "catid", "skillName", "category", "distance"])
    ordered_cat_cv = calculatedDF_cat_cv.orderBy(asc("cvid"), asc("distance")).coalesce(2)
    ordered_cat_cv.write.csv('Calculated/tfidf/cv-category')
    # Calculate cv-category similarity END

    # Job-category START
    crossJoined_job_cat = jobs.select("jobId", "features").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd\
        .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.features, x.featuresCAT)))\
        .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy(asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/tfidf/job-category')
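# The calculate_distance helper used above is not shown in this snippet; a
# plausible implementation (an assumption, not the author's code) is cosine
# distance between two pyspark.ml sparse TF-IDF vectors:
import numpy as np

def calculate_distance(vec_a, vec_b):
    """Cosine distance between two TF-IDF feature vectors."""
    a = vec_a.toArray()
    b = vec_b.toArray()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 1.0  # treat empty vectors as maximally distant
    return float(1.0 - np.dot(a, b) / denom)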
plt.tight_layout(pad=0)
plt.show()

# In[102]:

ham_words = ' '.join(list(df[df['label'] == 0]['message']))
ham_wc = WordCloud(width=512, height=512).generate(ham_words)
plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(ham_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# In[44]:

tokenizer = Tokenizer(inputCol="message", outputCol="tokenized")

# In[45]:

hasher = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="frequency")

# In[46]:

idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# In[47]:

from pyspark.ml.classification import RandomForestClassifier

# In[48]:
# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4
# Create an empty parameter grid
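# A minimal sketch (assumed continuation, not the original exercise solution):
# build an empty parameter grid and wrap the text-classification pipeline above
# in a cross-validator.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

params = ParamGridBuilder().build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=params,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=5)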
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover
from pyspark.ml.clustering import KMeans

# Check if all the params were passed
if (len(sys.argv) > 5):
    # Setup the sparkContext
    sc = SparkContext(appName="SparkClustering-emonto15-dperezg1")
    spark = SparkSession(sc)

    # Read from hdfs and save using a schema (path, text)
    files = sc.wholeTextFiles("hdfs://" + sys.argv[1])
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    df = spark.createDataFrame(files, schema)

    # Divide the text into an array of words
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    # Set the language used to remove the stopwords
    StopWordsRemover.loadDefaultStopWords(sys.argv[4])
    # Read from the tokens column (the output of the tokenizer) and save a new array of words without the stopwords
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens")
    # Hash each word together with its frequency in each document, keeping only numFeatures features
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=int(sys.argv[3]))
    # Calculate the inverse document frequency and ignore terms below minDocFreq
    # (explained in more detail in the article referenced by this code)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    # Initialize the kmeans with a specific K
    kmeans = KMeans(k=int(sys.argv[2]))

    # Declare the assembly line (map of transformations) to transform the dataset
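    # A minimal sketch (assumed continuation): assemble the declared stages into
    # a Pipeline, fit it, and cluster the TF-IDF vectors with KMeans.
    from pyspark.ml import Pipeline

    pipeline = Pipeline(stages=[tokenizer, stopWords, hashingTF, idf, kmeans])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.select("path", "prediction").show(truncate=False)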
def main():
    spark = SparkSession.builder.appName('AmazonReviewsSparkProcessor').getOrCreate()

    # Convert command line args into a map of args
    args_iter = iter(sys.argv[1:])
    args = dict(zip(args_iter, args_iter))

    # Retrieve the args and replace 's3://' with 's3a://' (used by Spark)
    s3_input_data = args['s3_input_data'].replace('s3://', 's3a://')
    print(s3_input_data)
    s3_output_data = args['s3_output_data'].replace('s3://', 's3a://')
    print(s3_output_data)

    schema = StructType([
        StructField('is_positive_sentiment', IntegerType(), True),
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data,
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass over the data, applying IDF needs two passes:
    #   1) compute the IDF vector
    #   2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  # , minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('is_positive_sentiment', 'features').show()

    # TODO: Use SVD instead
    # features_vector_rdd = features_df.select('features').rdd.map( lambda row: Vectors.fromML(row.getAs[MLVector]('features') )
    # features_vector_rdd.cache()
    # mat = RowMatrix(features_vector_rdd)
    # k = 300
    # svd = mat.computeSVD(k, computeU=True)
    # TODO: Reconstruct

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('is_positive_sentiment', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df).select('is_positive_sentiment', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn('f', to_array(col('scaled_pca_features')))
                            .select(['is_positive_sentiment'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    # Removed overwrite to test for this issue:
    # https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz
    expanded_features_df.write.csv(path=s3_output_data,
                                   header=None,
                                   quote=None)  # , mode='overwrite')

    print('Wrote to output file: {}'.format(s3_output_data))
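# The to_array helper used above is not defined in this snippet; a common
# definition (an assumption, not necessarily the author's) converts an ML Vector
# column into an array-of-doubles column via a UDF:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

def to_array(vector_col):
    return udf(lambda v: v.toArray().tolist() if v is not None else None,
               ArrayType(DoubleType()))(vector_col)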
when(col("product_description").isNull(), "empty").otherwise(col("product_description"))) # combine relevant fields (product_title, attr_list, product_description, search_term) as one named `info` train = train.select( "product_uid1", "search_term", concat(col("product_title"), lit(' '), col("attr_list"), lit(' '), col("product_description"), lit(' '), col("search_term")).alias("info")).orderBy("product_uid1") train.show() # train.printSchema() # train.write.option("header", 'true').csv(path + 'saved_file.csv') from pyspark.ml.feature import StopWordsRemover, Tokenizer, Word2Vec # tokenize `info` and remove stopwords in unioned data tokenizer = Tokenizer(inputCol="info", outputCol="tokenized_info") tokenized = tokenizer.transform(train).drop("info") tokenized.show() # remover = StopWordsRemover(inputCol="tokenized_info", outputCol="filtered_info") # removed = remover.transform(tokenized).drop("tokenized_info").na.drop() # print removed.count() # use union tokenized word2Vec = Word2Vec(vectorSize=3, minCount=5, inputCol="tokenized_info", outputCol="vec") model = word2Vec.fit(tokenized) # model.getVectors().show() # tokenized search terms in train.csv term_tokenizer = Tokenizer(inputCol="search_term", outputCol="term_token")
if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("PipelineExample")\ .getOrCreate() # $example on$ # Prepare training documents from a list of (id, text, label) tuples. training = spark.createDataFrame([(0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"]) # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10, regParam=0.01) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) # Prepare test documents, which are unlabeled (id, text) tuples. test = spark.createDataFrame([(4L, "spark i j k"), (5L, "l m n"), (6L, "mapreduce spark"), (7L, "apache hadoop")], ["id", "text"]) # Make predictions on test documents and print columns of interest. prediction = model.transform(test)
    'SentimentSource',
]
train = train.select(
    [column for column in train.columns if column not in drop_list])
train = train.withColumn("label", train["label"].cast("TINYINT"))

# removes null rows
train = train.na.drop()

# Load test data
test = spark.read.format("csv").option("header", "true").option(
    "delimiter", ";").load("cloud.csv")

# tokenizer
trainTokenizer = Tokenizer(inputCol="SentimentText;;;;;;;;;;;;;;;;;;;;;;;;",
                           outputCol="words")
trainCountTokens = udf(lambda words: len(words), IntegerType())
train = trainTokenizer.transform(train)

testTokenizer = Tokenizer(inputCol="text", outputCol="words")
testCountTokens = udf(lambda words: len(words), IntegerType())
test = testTokenizer.transform(test)

# Remove stopwords
trainRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
train = trainRemover.transform(train)
testRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
test = testRemover.transform(test)

# fit CountVectorizerModel
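# A minimal sketch (assumed continuation) of fitting a CountVectorizerModel on the
# stop-word-filtered training tokens and applying it to both the train and test sets.
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="filtered", outputCol="features")
cvModel = cv.fit(train)
train = cvModel.transform(train)
test = cvModel.transform(test)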