def tokenize(p_df, in_column, out_column):
    """
    Split the text of one DataFrame column into tokens.

    The text is split on every non-word character; all other
    RegexTokenizer settings are left at their defaults.

    :param p_df: A DataFrame.
    :param in_column: Name of the column holding the text to split.
    :param out_column: Name of the column that receives the tokens.
    :return: The DataFrame produced by the tokenizer's ``transform``.
    """
    return RegexTokenizer(
        inputCol=in_column,
        outputCol=out_column,
        pattern="\\W",
    ).transform(p_df)
def tokenize(df, column):
    """
    Tokenize alpha-numeric words in ``column``.

    Tokens are the runs of word characters matched by the pattern
    (``gaps=False`` makes the pattern describe tokens, not separators).
    RegexTokenizer lower-cases the text by default, so tokens come out in
    lower case.

    NOTE(review): this docstring used to promise that terms shorter than
    3 characters are removed. Nothing in this function does that
    (``minTokenLength`` keeps its default) -- confirm whether the filter
    lives elsewhere or should be added here.

    :param df: Input DataFrame.
    :param column: Name of the string column to tokenize.
    :return: Whatever ``replace`` returns after the tokenized column has
        been produced.
    """
    tmp_column = '_' + column

    # creates tokenizer based on regular expressions
    word_tokenizer = RegexTokenizer(
        inputCol=column,
        outputCol=tmp_column,
        pattern=r'\w+',  # raw string: '\w' is not a valid str escape
    ).setGaps(False)  # match tokens rather than gaps

    # transform: string --> array<string>
    df = word_tokenizer.transform(df)

    # `replace` is defined elsewhere; presumably it swaps the original column
    # for the temporary tokenized one -- verify against its definition.
    df = replace(df, column, tmp_column)
    return df
def main():
    """
    Compare bag-of-words and TF-IDF features for predicting a company sector.

    Reads ``yahoo.csv`` from ``BUILDDIR``, keeps the ``sector`` and
    ``description`` columns, and runs one preparation pipeline that
    tokenizes the description, removes stop words, and adds a word-count
    vector (``words_count``), a TF-IDF vector (``words_tfidf``) and an
    indexed ``label``. The prepared rows are split 80/20, one logistic
    regression is trained per feature column, and each model's accuracy on
    the test split is printed.

    NOTE(review): the preparation pipeline is fitted on all rows before the
    train/test split, so the vocabulary and IDF weights have seen the test
    rows.
    """
    spark = SQLContext(SparkContext.getOrCreate())

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description', outputCol='words_all',
                              pattern=r'\W')

    # remove stop words (both word lists are concatenated, one word per line)
    stopwords = '\n'.join(
        (DATADIR / 'stopwords' / f).read_text().strip()
        for f in ('mysql.txt', 'nltk.txt')
    ).splitlines()
    remove_stopwords = StopWordsRemover(
        inputCol='words_all', outputCol='words_clean').setStopWords(stopwords)

    # get words frequency using simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean',
                                    outputCol='words_count',
                                    vocabSize=1000, minDF=2)

    # get tf-idf words frequencies
    add_wordtf = HashingTF(inputCol='words_clean', outputCol='words_tf',
                           numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf', outputCol='words_tfidf',
                      minDocFreq=2)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])

    # apply data preparation pipeline
    # (a leftover breakpoint() used to stop the run right after this step)
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)

    # split to training and testing
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)

    # fit logistic regression models
    logistic_wordcount = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                            featuresCol='words_count',
                                            labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')
    logistic_tfidf = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                        featuresCol='words_tfidf',
                                        labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  metricName='accuracy')

    for model, name in (
            (logistic_wordcount, 'Word count + Logistic regression'),
            (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
.orderBy(col("count").desc()) \ .show() data.groupBy("SentimentText") \ .count() \ .orderBy(col("count").desc()) \ .show() # set seed for reproducibility (trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100) print("Training Dataset Count: " + str(trainingData.count())) print("Test Dataset Count: " + str(testData.count())) # regular expression tokenizer regexTokenizer = RegexTokenizer(inputCol="SentimentText", outputCol="words", pattern="\\W") # stop words add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"] stopwordsRemover = StopWordsRemover( inputCol="words", outputCol="filtered").setStopWords(add_stopwords) # bag of words count countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5) # convert string labels to indexes label_stringIdx = StringIndexer(inputCol="Sentiment", outputCol="label")
# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

# A sentence boundary is either ". " or a run of two or more newlines.
pattern = r"(\. |\n{2,})"
import re
matches = re.findall(pattern, "Wiki page. *More information*\n\n And a line\n that continues.")
# print(...) with a single argument behaves the same on Python 2 and 3.
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

# The pattern describes the gaps, so each token is one sentence.
# NOTE(review): RegexTokenizer lower-cases its input by default, so the
# sentences come out lower-cased -- confirm that is wanted.
tokenizer = RegexTokenizer(inputCol="text", outputCol="sentences", pattern=pattern)
sentences = tokenizer.transform(parsed).select("sentences")
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

# Flatten the per-document sentence arrays into one row per sentence.
# Going through .rdd works on Spark 1.3+ and on Spark 2+, where
# DataFrame.flatMap no longer exists.
sentenceRDD = sentences.rdd.flatMap(lambda r: r[0]).map(lambda x: Row(sentence=x))
sentenceSchema = StructType([StructField("sentence", StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)
display(sentence)
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    CountVectorizer,
    HashingTF,
    RegexTokenizer,
    StopWordsRemover,
    Tokenizer,
)
from pyspark.sql.functions import array, coalesce, col, when

# Quick look at the inputs.
train_df.show()
mapping_df.show()

df1 = train_df.select("genre")  # train_df.column("genre")
df1.show(5)
train_df.printSchema()

# Earlier attempt with the plain whitespace Tokenizer, kept for reference:
# plotToken = Tokenizer(inputCol="plot", outputCol="splitWords")
# plotToken.transform(dataset).head()
# dataset.na.drop(subset=["plot"])

# Split the plot text on non-word characters.
# (The variable name keeps its original spelling; later code may refer to it.)
regexToeknizer = RegexTokenizer(
    inputCol="plot",
    outputCol="tokens",
    pattern="\\W",
)
# dataset = regexToeknizer.transform(train_df)
# dataset.printSchema()
# dataset.select("plot").show(5)

# Drop stop words from the tokens.
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="stopRemove",
)
# dataset = remover.transform(dataset)
# dataset.printSchema()
# dataset.show(2)

# fillNull = array().cast("array<string>")  # null-handling workaround taken from Stack Overflow
def load_data(manifest, base='gs', kind='bytes'):
    '''Load data from a manifest file into a DataFrame.

    A manifest file gives the hash identifying each document on separate
    lines. Each document is read in full, tokenized, and returned with
    columns `id`, `url`, and `features`, where `id` is a document
    identifier, `url` is the path to the document, and `features` holds the
    tokens extracted from its contents. The raw text is dropped.

    Note that the document ID is _not_ the same as the hash. The ID is the
    position of the hash in the manifest file, so the order of the IDs
    matches the order given in the manifest.

    Args:
        manifest (path):
            Path or URL of the manifest file.
        base (path):
            The base of the URL or path to the data. The special strings 'gs'
            and 'https' expand to the URLs used by Data Science Practicum at
            UGA over the Google Storage and HTTPS protocols respectively.
        kind (str):
            The kind of file to use, one of 'bytes' or 'asm'. Any value
            other than 'bytes' is treated as 'asm'.
            - 'bytes' loads hex strings for the bytes in the binary files.
            - 'asm' loads segment titles and the opcodes from the asm files.

    Returns:
        DataFrame[id: bigint, url: string, features: array<string>]
    '''
    spark = elizabeth.session()
    ctx = spark.sparkContext

    # Special base paths
    if base == 'https':
        base = 'https://storage.googleapis.com/uga-dsp/project2/data'
    if base == 'gs':
        base = 'gs://uga-dsp/project2/data'

    # Normalize `kind`. A stray `kind = 'bytes'` used to sit in front of this
    # check, which silently ignored the caller's choice and made the 'asm'
    # branch below unreachable.
    if kind != 'bytes':
        kind = 'asm'

    # Read the manifest as an iterator over (id, url).
    # We use Spark to build the iterator to support hdfs etc.
    manifest = str(manifest)  # cast to str to support pathlib.Path etc.
    manifest = ctx.textFile(manifest)  # RDD[hash]
    manifest = manifest.map(hash_to_url(base=base, kind=kind))  # RDD[url]
    manifest = manifest.zipWithIndex()  # RDD[url, id]
    manifest = manifest.map(lambda x: (x[1], x[0]))  # RDD[id, url]
    manifest = manifest.toLocalIterator()  # (id, url)

    def prepend(*args):
        """Return a mapper that puts ``args`` in front of each record."""
        return lambda x: (*args, *x)

    # Read every file named in the manifest and tag its records with the id.
    # NOTE(review): ctx.union is handed an empty list when the manifest is
    # empty -- confirm how that case should behave.
    data = [
        ctx.wholeTextFiles(url).map(prepend(doc_id))  # RDD[id, url, text]
        for doc_id, url in manifest
    ]
    data = ctx.union(data)  # RDD[id, url, text]
    data = data.toDF(['id', 'url', 'text'])  # DF[id, url, text]

    # Tokenization : DF[id, url, text, features]
    # RegexTokenizer lower-cases the text *before* applying the pattern
    # (toLowercase defaults to True), so patterns written with upper-case hex
    # digits must be case-insensitive or bytes such as "FF" are never matched.
    tokenizer = RegexTokenizer(inputCol='text', outputCol='features', gaps=False)
    if kind == 'bytes':
        tokenizer.setPattern('(?i)(?<= )[0-9A-F]{2}')
    elif kind == 'asm':
        opcodes = '|'.join(_opcodes)
        tokenizer.setPattern(
            r'(?i)(\.?\w+:(?=[0-9A-F]{8}\s))|(\b(' + opcodes + r')\b)')
    data = tokenizer.transform(data)
    data = data.drop('text')
    return data
def main(argv=None):
    """
    Train and evaluate a review-rating regressor on word-cluster features.

    Reviews are tokenized and stripped of stop words, a Word2Vec model
    supplies the word vectors, and a pipeline of WordCluster ->
    CountVectorizer -> Normalizer -> LinearRegression is tuned with 5-fold
    cross validation. RMSE on the train and test sets is printed.

    argv: argument list laid out like ``sys.argv`` (index 0 is the program
        name, 1 the train JSON path, 2 the test JSON path). Defaults to
        ``sys.argv``. Previously a non-None ``argv`` left the two input
        paths undefined.
    """
    if argv is None:
        argv = sys.argv
    inputs_train = argv[1]
    inputs_test = argv[2]

    conf = SparkConf().setAppName('sentiment-analysis-word2vec-cluster')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    # read train json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_train)
    train = text.select('overall', 'reviewText').withColumnRenamed('overall', 'label')
    train.cache()

    ## DATA PROCESSING PIPELINE
    # Split at whitespace and characters that are not letters
    tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words",
                               pattern="\\P{Alpha}+")
    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    pipeline_data_processing = Pipeline(stages=[tokenizer, remover])
    model_data_processing = pipeline_data_processing.fit(train)
    train_processed = model_data_processing.transform(train)
    train.unpersist()
    train_processed.cache()

    ## INTERMEDIATE STEP TO GET WORD VOCABULARY AND VECTOR
    # word2vec
    word2Vec = Word2Vec(inputCol="filtered_words", outputCol="word2vec_features")
    model_word2Vec = word2Vec.fit(train_processed)
    # Dataframe dictionary of Word-vectors
    vocabulary = model_word2Vec.getVectors()
    vocabulary.cache()

    ## ML PIPELINE
    # WordCluster Features
    wordcluster = WordCluster(inputCol="filtered_words", predictionCol="cluster",
                              k=3, vocabulary=vocabulary)
    # get vector of cluster frequency for each document
    count_vectorizer = CountVectorizer(inputCol="cluster", outputCol="count")
    # normalized cluster frequency vector for each document
    normalizer = Normalizer(inputCol="count", outputCol="features", p=1.0)
    # linear Regression Model
    lr = LinearRegression(maxIter=20, regParam=0.1)
    # Final Pipeline
    pipeline = Pipeline(stages=[wordcluster, count_vectorizer, normalizer, lr])

    ## FIT MODEL USING CROSS VALIDATION
    # Parameter grid for cross validation: number of clusters k and regParam
    paramGrid = ParamGridBuilder() \
        .addGrid(wordcluster.k, [1000, 5000, 10000, 20000]) \
        .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1.0]) \
        .build()
    # 5-fold cross validation
    evaluator = RegressionEvaluator(metricName="rmse")
    crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=5)
    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(train_processed)

    # RMSE on train data
    prediction_train = model.transform(train_processed)
    rmse_train = evaluator.evaluate(prediction_train)
    train_processed.unpersist()
    vocabulary.unpersist()

    ## TEST DATA
    # read test json file and process data (label, feature)
    text = sqlCt.read.json(inputs_test)
    test = text.select('overall', 'reviewText').withColumnRenamed('overall', 'label')
    test_processed = model_data_processing.transform(test)

    # Evaluate the model on test data
    prediction_test = model.transform(test_processed)
    rmse_test = evaluator.evaluate(prediction_test)

    # Print Result
    result = "MODEL WITH Word Clustering features - best k = " \
        + str(model.bestModel.stages[0].getK()) + ":\n"
    result = result + "-Train RMSE: " + str(rmse_train) + "\n"
    result = result + "-Test RMSE: " + str(rmse_test) + "\n"
    print(result)
# Spark entry points for the "Tweet" application.
sc = SparkContext(appName="Tweet")
spark = SparkSession(sc)
sc.setLogLevel("WARN")

# Elastic Search
# NOTE(review): this SparkConf is created after the SparkContext above and
# is not passed to anything in this section -- confirm it is consumed later.
conf = SparkConf(loadDefaults=False)
conf.set("es.index.auto.create", "true")

# read the dataset
training_set = spark.read.csv(
    '../tap/spark/dataset/training_set_sentipolc16.csv',
    schema=schema,
    header=True,
    sep=',',
)

# stage 1: split the tweet text on non-word characters
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')

# stage 2: drop the stop words
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')

# stage 3: turn the remaining words into a vector of size 100
stage_3 = Word2Vec(inputCol='filtered_words', outputCol='vector', vectorSize=100)

# stage 4: logistic regression on the word vector, labelled by 'positive'
model = LogisticRegression(featuresCol='vector', labelCol='positive')

# assemble the pipeline and fit it on the training data
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])
pipelineFit = pipeline.fit(training_set)

# training summary of the last stage (the logistic regression)
modelSummary = pipelineFit.stages[-1].summary
modelSummary.accuracy
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from kafka import KafkaConsumer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#-------------------Building the logistic regression and naive bayes pipelines----------------------------------
if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    # Split the text on non-word characters.
    regex_tokenizer = RegexTokenizer(
        inputCol="text",
        outputCol="words",
        pattern="\\W",
    )

    # The stop-word file is read as whitespace-separated words.
    with open('/home/asdf/Documents/stopwords.txt', 'r') as contents:
        stop_words = contents.read().split()
    stop_words_remover = StopWordsRemover(
        inputCol="words",
        outputCol="filtered",
    ).setStopWords(stop_words)

    # Bag-of-words counts over the filtered tokens.
    count_vectors = CountVectorizer(
        inputCol="filtered",
        outputCol="features",
        vocabSize=10000,
        minDF=5,
    )

    lr = LogisticRegression(maxIter=100, regParam=0.01)
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = (
        SparkSession
        .builder
        .appName("TokenizerExample")
        .getOrCreate()
    )

    # $example on$
    sentenceDataFrame = spark.createDataFrame(
        [
            (0, "Hi I heard about Spark"),
            (1, "I wish Java could use case classes"),
            (2, "Logistic,regression,models,are,neat"),
        ],
        ["label", "sentence"],
    )

    # Plain tokenizer versus a regex tokenizer splitting on non-word characters.
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    # Print the first three rows produced by each tokenizer.
    tokenized = tokenizer.transform(sentenceDataFrame)
    for row in tokenized.select("words", "label").take(3):
        print(row)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    for row in regexTokenized.select("words", "label").take(3):
        print(row)
    # $example off$

    spark.stop()
else: return "../examples/smalldata/" + file_name ## This method loads the data, perform some basic filtering and create Spark's dataframe def load(): row_rdd = spark.sparkContext.textFile(_locate("smsData.txt")).map(lambda x: x.split("\t", 1)).filter(lambda r: r[0].strip()) return spark.createDataFrame(row_rdd, ["label", "text"]) ## ## Define the pipeline stages ## ## Tokenize the messages tokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=3, gaps=False, pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model
# COMMAND ----------

# Plain tokenizer on the Description column.
from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)

# COMMAND ----------

# Regex tokenizer: the pattern (a single space) describes the separators.
from pyspark.ml.feature import RegexTokenizer
rt = (
    RegexTokenizer()
    .setInputCol("Description")
    .setOutputCol("DescOut")
    .setPattern(" ")
    .setToLowercase(True)
)
rt.transform(sales.select("Description")).show(20, False)

# COMMAND ----------

# Same tokenizer with gaps disabled: the pattern now describes the tokens
# themselves instead of the separators.
from pyspark.ml.feature import RegexTokenizer
rt = (
    RegexTokenizer()
    .setInputCol("Description")
    .setOutputCol("DescOut")
    .setPattern(" ")
    .setGaps(False)
    .setToLowercase(True)
)
rt.transform(sales.select("Description")).show(20, False)
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer, HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import RandomForest

## Load the sample with pandas, then hand it to Spark
## (`sc` is expected to exist already, e.g. provided by the shell)
df_pandas = pd.read_csv('sample.csv')
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(df_pandas)

## Tokenize the text and hash the tokens into 10000 feature buckets
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features")
words_df = tokenizer.transform(df)
df_feat = hashingTF.transform(words_df)

## Training points for every row; prediction features only for rows labelled 1
## NOTE(review): calling .map directly on a DataFrame only works on old
## Spark releases -- confirm the targeted version.
lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features))
predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features)

## Compare predictions from Different Models
## Logistic Regression
lrm = LogisticRegressionWithSGD.train(lp, iterations=10)
logit_predict = lrm.predict(predict_feat)
logit_predict.sum()
# Give the index column a readable name and fix the column order.
test_df = test_df.withColumnRenamed('_2', 'id')
test_df = test_df.select('id', 'category', 'text')

# Report the first five rows of the indexed test set.
writeToFile("\nTask 1.1 (b)\n")
writeToFile("First 5 rows of 'INDEXED' test set \n\n")
k = test_df.take(5)
for i, row in enumerate(k):
    row_name = 'Row-' + str(i)
    writeToFile(row_name + '\n')
    writeToFile(str(row[0]) + ', ' + str(row[1]) + ', ' + str(row[2]) + '\n\n')

########################################################################################################
# Build pipeline and run
indexer = StringIndexer(inputCol="category", outputCol="label")
# Raw string: '\W' is not a valid escape in an ordinary string literal.
# Splits on runs of non-word characters and keeps the original case.
tokenizer = RegexTokenizer(pattern=r'\W+', inputCol="text", outputCol="words", toLowercase=False)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Building model pipeline
pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr])

# Train model on training set
model = pipeline.fit(train_df)  # if you give new names to your indexed datasets, make sure to make adjustments here

# Model prediction on test set
pred = model.transform(test_df)  # ...and here
# Most frequent texts first.
(
    data.groupBy("text")
    .count()
    .orderBy(col("count").desc())
    .show()
)

# set seed for reproducibility
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

trainingData.printSchema()
trainingData.show(5)
testData.show(5)

# regular expression tokenizer (no pattern given, so the default is used)
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words")

# custom stop-word list
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "RT"]
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
).setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

# convert string labels to indexes
label_stringIdx = StringIndexer(inputCol="airline_sentiment", outputCol="label")
from pyspark.ml.feature import (RegexTokenizer, StopWordsRemover, Word2Vec)

# Split the tweet text on non-word characters.
_regex_tokenizer = RegexTokenizer(
    inputCol='tweet',
    outputCol='tokens',
    pattern='\\W',
)

# Drop stop words from the tokens.
_stop_word_remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered_words',
)

# Embed the remaining words as a 100-dimensional vector.
_word_2_vec = Word2Vec(
    inputCol='filtered_words',
    outputCol='vector',
    vectorSize=100,
    minCount=5,
    numPartitions=1,
    stepSize=0.025,
    maxIter=1,
)

# Stages in the order they must be applied.
TRANSFORMERS = [
    _regex_tokenizer,
    _stop_word_remover,
    _word_2_vec,
]
# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

# A sentence boundary is either '. ' or a run of two or more newlines.
pattern = r'(\. |\n{2,})'
import re
matches = re.findall(pattern, 'Wiki page. *More information*\n\n And a line\n that continues.')
# print(...) with a single argument behaves the same on Python 2 and 3.
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

# The pattern describes the gaps, so each token is one sentence.
# NOTE(review): RegexTokenizer lower-cases its input by default, so the
# sentences come out lower-cased -- confirm that is wanted.
tokenizer = RegexTokenizer(inputCol='text', outputCol='sentences', pattern=pattern)
sentences = tokenizer.transform(parsed).select('sentences')
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

# Flatten the per-document sentence arrays into one row per sentence.
# Going through .rdd works on Spark 1.3+ and on Spark 2+, where
# DataFrame.flatMap no longer exists.
sentenceRDD = (sentences.rdd
               .flatMap(lambda r: r[0])
               .map(lambda x: Row(sentence=x)))
sentenceSchema = StructType([StructField('sentence', StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)
if __name__ == "__main__":
    time1 = datetime.datetime.now()

    spark = SparkSession\
        .builder\
        .appName("news")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()

    # The CSV location is the first command-line argument.
    bucket_path = sys.argv[1]
    news_data = spark.read.csv(bucket_path, header='True', inferSchema='True')

    # Keep title and category, dropping rows where either is missing.
    title_category = news_data.select("TITLE", "CATEGORY")
    title_category = title_category.dropna()

    # Strip digits from the title. Raw string: '\d' is not a valid escape
    # in an ordinary string literal.
    title_category = title_category.withColumn(
        "only_str", regexp_replace(col('TITLE'), r'\d+', ''))

    # Tokenize on non-word characters, then drop stop words.
    regex_tokenizer = RegexTokenizer(inputCol="only_str", outputCol="words",
                                     pattern="\\W")
    raw_words = regex_tokenizer.transform(title_category)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    words_df = remover.transform(raw_words)

    # Index the category strings for use as the label column.
    indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex")
    feature_data = indexer.fit(words_df).transform(words_df)

    # Bag-of-words features.
    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(feature_data)
    countVectorizer_feateures = model.transform(feature_data)

    (trainingData, testData) = countVectorizer_feateures.randomSplit([0.8, 0.2], seed=11)

    nb = NaiveBayes(modelType="multinomial", labelCol="categoryIndex",
                    featuresCol="features")
    nbModel = nb.fit(trainingData)
# by top 20 categories
(
    data.groupBy("Category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

# by top 20 descriptions
(
    data.groupBy("Descript")
    .count()
    .orderBy(col("count").desc())
    .show()
)

# regular expression tokenizer: split the description on non-word characters
regexTokenizer = RegexTokenizer(
    inputCol="Descript",
    outputCol="words",
    pattern="\\W",
)

# stop words (a custom list, applied through setStopWords)
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
).setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

# category string -> numeric label
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

transformers = [regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]
pipeline = Pipeline(stages=transformers)
(when(col("comment").like("%my dog%"), 1) \ .when(col("comment").like("%I have a dog%"), 1) \ .when(col("comment").like("%my cat%"), 1) \ .when(col("comment").like("%I have a cat%"), 1) \ .when(col("comment").like("%my puppy%"), 1) \ .when(col("comment").like("%my pup%"), 1) \ .when(col("comment").like("%my kitty%"), 1) \ .when(col("comment").like("%my pussy%"), 1) \ .otherwise(0))) df_clean.show() # 1. Data preprocesing and build the classifier from pyspark.ml.feature import RegexTokenizer, Word2Vec from pyspark.ml.classification import LogisticRegression # regular expression tokenizer regexTokenizer = RegexTokenizer(inputCol="comment", outputCol="words", pattern="\\W") word2Vec = Word2Vec(inputCol="words", outputCol="features") from pyspark.ml import Pipeline pipeline = Pipeline(stages=[regexTokenizer, word2Vec]) # Fit the pipeline to training documents. pipelineFit = pipeline.fit(df_clean) dataset = pipelineFit.transform(df_clean) dataset.show() # Remove the emtpy features caused by none English statements. from pyspark.sql.functions import col from pyspark.sql.types import BooleanType from pyspark.sql.functions import udf
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql import Row


# In[58]:

# TODO - Change this directory to the right location where the data is stored
dataDir = "/Users/RajT/Downloads/20_newsgroups/*"

# Read every file in full; each file's content becomes one `sentence` row
wholeFiles = sc.wholeTextFiles(dataDir)
textRDD = wholeFiles.map(lambda recs: Row(sentence=recs[1]))
textDF = spark.createDataFrame(textRDD)


# In[59]:

# Tokenize the sentences to words (gaps=False: the pattern matches the words)
regexTokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    gaps=False,
    pattern="\\w+",
)
tokenizedDF = regexTokenizer.transform(textDF)


# In[60]:

# Prepare the Estimator
# vectorSize is the size of each word vector; minCount is the minimum number
# of times a token must appear to be included in the word2vec vocabulary.
word2Vec = Word2Vec(
    vectorSize=3,
    minCount=0,
    inputCol="words",
    outputCol="result",
)
# Train the model
model = word2Vec.fit(tokenizedDF)


# In[61]:
df_clean.show()

# COMMAND ----------

# MAGIC %md
# MAGIC #### 1. Data preprocessing and Build the classifier
# MAGIC To train a model against comments, we use RegexTokenizer to split each comment into a list of words and then use Word2Vec to convert the list to a word vector. Word2Vec map each word to a unique fixed-size vector and then transform each document into a vector using the average of all words in the document.

# COMMAND ----------

# data preprocessing: split each comment on non-word characters into `text`
from pyspark.ml.feature import RegexTokenizer
regexTokenizer = RegexTokenizer(
    inputCol="comment",
    outputCol="text",
    pattern="\\W",
)
df_clean = regexTokenizer.transform(df_clean)
df_clean.show(10)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Alert: First try is to use 1,000,000 rows for testing

# COMMAND ----------

# Shuffle with a fixed seed, then keep the first 1,000,000 rows.
from pyspark.sql.functions import rand
shuffled = df_clean.orderBy(rand(seed=0))
shuffled.createOrReplaceTempView("table1")
df_clean = spark.sql("select * from table1 limit 1000000")
# Generate ensemble with Random Forest. Accuracy ~ 0.203 if __name__ == "__main__": spark = getSparkSession() passengers = readPassengersWithCastingToDoubles(spark).select( "survived", "pclass", "sibsp", "parch", "sex", "embarked", "age", "fare", "name") training, test = passengers.randomSplit([0.7, 0.3], seed=12345) training.cache() test.cache() regexTokenizer = RegexTokenizer(gaps=False, pattern="\\w+", inputCol="name", outputCol="name_parts", toLowercase=True) stopWords = ["mr", "mrs", "miss", "master", "jr", "j", "c", "d"] remover = StopWordsRemover(inputCol="name_parts", outputCol="filtered_name_parts", stopWords=stopWords) hashingTF = HashingTF(numFeatures=1000, inputCol="filtered_name_parts", outputCol="text_features") sexIndexer = StringIndexer(inputCol="sex", outputCol="sexIndexed",
get_topic_0, StructType([ StructField("cluster_id", IntegerType()), StructField("score", FloatType()), StructField("value", StringType()) ])) #newDF = df.withColumn("title", udf_get_title(df.marc)).withColumn("marc_subjects", udf_get_subjects(df.marc)) newDF = df.withColumn("allTextString", udf_flatten_text(df.allTextArray)) df = newDF df.show(10, False) tokenizer = RegexTokenizer(inputCol="allTextString", outputCol="word_tokens", pattern="\\W") TokenizerData = tokenizer.transform(df) df = TokenizerData remover = StopWordsRemover(inputCol="word_tokens", outputCol="stop_removed") my_sw = [ 'united', 'states', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' ] sw = remover.loadDefaultStopWords("english") remover.setStopWords(sw + my_sw) StopWordsRemoverData = remover.transform(df) df = StopWordsRemoverData cv = CountVectorizer(inputCol="stop_removed",
return "../examples/smalldata/" + file_name ## This method loads the data, perform some basic filtering and create Spark's dataframe def load(): row_rdd = spark.sparkContext.textFile(_locate("smsData.txt")).map(lambda x: x.split("\t", 1)).filter(lambda r: r[0].strip()) return spark.createDataFrame(row_rdd, ["label", "text"]) ## ## Define the pipeline stages ## ## Tokenize the messages tokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=3, gaps=False, pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model
sqlContext = SQLContext(sc)

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql import SparkSession

# Build a SparkSession; SparkSession provides a single point of entry to
# interact with underlying Spark functionality
spark = (
    SparkSession
    .builder
    .appName("similairityExample")
    .getOrCreate()
)

# Work on the first 1000 documents only.
df = sqlContext.read.json('/home/sl4401/AA/wiki_**')
df = df.limit(1000)

# Split on anything that is not an ASCII letter, lower-casing the text.
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="[^A-Za-z]+",
    toLowercase=True,
)
tokenized_data = regexTokenizer.transform(df)

# Remove stop words.
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = stopWordsRemover.transform(tokenized_data)

# Term frequencies hashed into 20 buckets, then IDF weighting.
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
featurizedData = hashingTF.transform(filtered_data)

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
featurized_data = idfModel.transform(featurizedData)

# Normalize the TF-IDF vectors into the `norm` column.
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(featurized_data)

import math
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

# UDF returning the dot product of two vectors as a double.
dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
"C:/Users/yuy/Desktop/kaggle/movie/test.tsv", format='com.databricks.spark.csv', header="true", delimiter="\t", inferSchema="true", mode="DROPMALFORMED") train_raw = raw_data.select("PhraseId", "SentenceId", "Phrase") train_size = train_raw.count() test_size = test_data.count() #concate the two data set for tokenization and countvector total_data = train_raw.union(test_data) #regex tokenization tokenizer = RegexTokenizer(inputCol="Phrase", outputCol="words", pattern="\\w+", gaps=False) wordsDF = tokenizer.transform(total_data) #wordsDF.select("words").show(3, False) #dont remove stop words because in test file there's stop words to be predicted #count vectorized the word tokens cv = CountVectorizer(inputCol="words", outputCol="features") word_vec = cv.fit(wordsDF).transform(wordsDF).select( "features", "PhraseId") #word_vec.select("features").show(3) #IDF idf = IDF(inputCol="features", outputCol="IDF_features") idfModel = idf.fit(word_vec).transform(word_vec)
header = False, escape = "\"", schema = StructType([StructField("reviewId", IntegerType(), True), StructField("asin", StringType(), True), StructField("reviewText", StringType(), True)])) df = df.drop("asin") df = df.repartition(20) # # Use nltk.word_tokenizer to tokenize words # @udf(ArrayType(StringType())) # def tokenize(string): # return word_tokenize(string) # df = df.withColumn("words", tokenize("reviewText")) df = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(df) df = df.drop("reviewText") cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df) vocabulary = cv_model.vocabulary df = cv_model.transform(df) df = df.drop("words") df.cache() df = IDF(inputCol="tf", outputCol="tfidf").fit(df).transform(df) df = df.drop("tf") df.unpersist() @udf(MapType(StringType(), FloatType())) def create_map(vector):
), (2, "\"You don't mind your honor?\" he asked Tushin. \"I've lost my company, your honor. I don't know where... such bad luck!'\"" ) ], "id int, message string") df.show(truncate=False) #A tokenizer that converts the input string to lowercase and then splits it by white spaces. words = Tokenizer(inputCol="message", outputCol="words").transform(df) words.show(truncate=False) # 正規表達法資料清洗及斷詞 # A regex based tokenizer that extracts tokens either by using the provided regex pattern (in Java dialect) to split the text (default) or repeatedly matching the regex (if gaps is false). Optional parameters also allow filtering tokens using a minimal length. It returns an array of strings that can be empty. words = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W+").transform(df) words.show(truncate=False) # StopWordsRemover is feature transformer that filters out stop words from input. stop_words_removed = StopWordsRemover( inputCol="words", outputCol="stop_words_removed").transform(words) stop_words_removed.show(truncate=False) # 變成n字一組 # NGram is a feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. It returns an array of n-grams where each n-gram is represented by a space-separated string of words. When the input is empty, an empty array is returned. When the input array length is less than n (number of elements per n-gram), no n-grams are returned. ngram_df = NGram(n=2, inputCol="words", outputCol="ngrams").transform(words) ngram_df.show(truncate=False) ngram_df.select("ngrams").show(truncate=False)
StructField("dateline", StringType(), True), StructField("headline", StringType(), True), StructField("metadata", StringType(), True), StructField("dc", StringType(), True), StructField("text", StringType(), False), StructField("title", StringType(), True), ]) #customized dataFrame xmlDf = spark.read.format('com.databricks.spark.xml').options( rowTag='newsitem').load('hdfs://hadoop-dbse/user/pasumart/input_dataset/*', schema=custom).limit(10000) df_first = xmlDf.select(col('_itemid').alias('DocId'), 'text') tokens = RegexTokenizer(minTokenLength=2, inputCol='text', outputCol='Words', pattern="[^a-z]+") #tokenizing the sentences to words Tokenized = tokens.transform(df_first) Tokens_filtered = StopWordsRemover( inputCol='Words', outputCol='filtered_words') #filtering the words that contains stopwords Tokenized_filtered = Tokens_filtered.transform(Tokenized) cv = CountVectorizer( inputCol="filtered_words", outputCol="features" ) #constructing a matrix that maps each unique word to a unique ID cv_model = cv.fit(Tokenized_filtered)
# In[8]:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.sql.functions import udf, col, lower, regexp_replace

# Clean the text: every character that is neither an ASCII letter nor
# whitespace is removed (replaced with the empty string), then the result is
# lower-cased.
cleaned_tweet = lower(regexp_replace('tweet', "[^a-zA-Z\\s]", "")).alias('tweet')
data_clean = data.select('id', cleaned_tweet)


# In[9]:

# Now building a model pipeline
stage1 = RegexTokenizer(
    inputCol="tweet",
    outputCol="tokens",
    pattern="\\W",
)
stage2 = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered_words",
)
stage3 = Word2Vec(
    inputCol="filtered_words",
    outputCol="vector",
    vectorSize=1000,
)


# Can also use the below code for text preprocessing and cleaning

# In[ ]:

#from nltk.stem.snowball import SnowballStemmer
#stemmer = SnowballStemmer(language='english')
#Tokenizing the text
#tokenizer = Tokenizer(inputCol="tweet", outputCol="words_token")
#df_words_token = tokenizer.transform(data_clean).select('id', 'words_token')
def content_userid(self, file1, file2, input_model, u_id, sim_bus_limit=3):
    """Recommend businesses similar to the ones a user rated 3 stars or more.

    The reviews of each business are concatenated and run through the saved
    pipeline at ``input_model``; the resulting ``word_vec`` vectors are
    compared with ``CosineSim``. For every business the user liked, the 10
    closest other businesses become candidates. Candidates the user already
    liked are removed, each remaining candidate keeps its best score, and the
    ``sim_bus_limit`` highest-scoring candidates are returned.

    :param file1: Parquet path of the reviews. The columns ``business_id``,
        ``text``, ``stars`` and ``user_id`` are read.
    :param file2: Parquet path of the businesses. The columns
        ``business_id``, ``name``, ``categories``, ``latitude`` and
        ``longitude`` are read.
    :param input_model: Path of a saved ``PipelineModel``; its output must
        contain a ``word_vec`` column.
    :param u_id: Identifier of the user to recommend for.
    :param sim_bus_limit: Maximum number of recommendations returned.
    :return: List of rows with ``business_id``, ``score``, ``name``,
        ``categories``, ``latitude`` and ``longitude``.
    """
    from pyspark import SparkContext
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as sql_functions

    sparkconf_builder = spark_celery_app.sparkconf_builder
    spark_conf = sparkconf_builder()
    sc = SparkContext.getOrCreate(conf=spark_conf)
    spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

    data = spark.read.parquet(file1)
    # The temp view is kept for any SQL issued elsewhere against 'review'.
    data.createOrReplaceTempView('review')
    df_business = spark.read.parquet(file2)

    # Empty accumulator; one batch of candidates is appended per liked
    # business in the loop below.
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", IntegerType(), True),
        StructField("input_business_id", StringType(), True)
    ])
    similar_businesses_df = spark.createDataFrame([], schema)

    # One row per business holding the concatenation of all its review texts.
    review_rdd = (data.select('business_id', 'text')
                  .rdd.map(tuple)
                  .reduceByKey(operator.add))
    review_df = (spark.createDataFrame(review_rdd)
                 .withColumnRenamed('_1', 'business_id')
                 .withColumnRenamed('_2', 'text'))

    # The text pipeline is not re-fitted here: the fitted model is loaded
    # from disk and only applied.
    pipeline_model = PipelineModel.load(input_model)
    reviews_by_business_df = pipeline_model.transform(review_df)
    all_business_vecs = reviews_by_business_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()
    # reduceByKey above guarantees one vector per business_id.
    vec_by_business = dict(all_business_vecs)

    # Businesses the user rated 3 stars or more. Built with the DataFrame API
    # so that u_id is never interpolated into a SQL string.
    usr_rev_bus = (data
                   .filter((col('stars') >= 3.0) & (col('user_id') == u_id))
                   .select('business_id')
                   .distinct())
    liked_ids = [row[0] for row in usr_rev_bus.collect()]

    for liked_id in liked_ids:
        input_vec = vec_by_business[liked_id]
        scored = sc.parallelize(
            [(other_id, float(CosineSim(input_vec, other_vec)))
             for other_id, other_vec in all_business_vecs])
        similar_business_df = (spark.createDataFrame(scored)
                               .withColumnRenamed('_1', 'business_id')
                               .withColumnRenamed('_2', 'score')
                               .orderBy("score", ascending=False))
        similar_business_df = similar_business_df.filter(
            col("business_id") != liked_id).limit(10)
        similar_business_df = similar_business_df.withColumn(
            'input_business_id', lit(liked_id))
        # Accumulate, so that every liked business contributes candidates
        # (previously only the last iteration's candidates survived).
        similar_businesses_df = similar_businesses_df.union(
            similar_business_df)

    # Drop businesses the user already liked, and keep the best score when a
    # candidate is similar to several liked businesses.
    candidates = (similar_businesses_df
                  .filter(~(col('business_id').isin(liked_ids)))
                  .groupBy('business_id')
                  .agg(sql_functions.max('score').alias('score')))
    top_candidates = candidates.orderBy(
        "score", ascending=False).limit(sim_bus_limit)

    # Right join keeps every recommended business even when it has no row in
    # the business table.
    df_result = df_business.join(top_candidates, 'business_id',
                                 'right').select('business_id', 'score',
                                                 'name', 'categories',
                                                 'latitude', 'longitude')
    df_result.show()
    return df_result.collect()
# MAGIC %sql SELECT count(1), rating FROM reviews WHERE review LIKE '%great%' GROUP BY rating ORDER BY rating

# COMMAND ----------

# MAGIC %sql SELECT count(1), rating FROM reviews WHERE review LIKE '%poor%' GROUP BY rating ORDER BY rating

# COMMAND ----------

# MAGIC %md #NLP Pipeline

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

# Parenthesised chains are used instead of backslash continuations: a
# trailing backslash silently glues the next statement onto the chain.

# Split each review on runs of non-word characters.
tokenizer = (RegexTokenizer()
             .setInputCol("review")
             .setOutputCol("tokens")
             .setPattern("\\W+"))

# Remove stop words from the token list.
remover = (StopWordsRemover()
           .setInputCol("tokens")
           .setOutputCol("stopWordFree"))

# Term counts over a vocabulary capped at 1000 entries.
counts = (CountVectorizer()
          .setInputCol("stopWordFree")
          .setOutputCol("features")
          .setVocabSize(1000))

# COMMAND ----------

from pyspark.ml.feature import Binarizer
# <h3><b>6. Creating a Dataframe of input data and topic</b></h3>

# In[29]:

data = sqlContext.createDataFrame(dataList, ["index", "data","topic"])
data.show(5)

# <h3><b>7. Splitting data and converting into matrix of token counts</b></h3>

# In[35]:

# Regular-expression tokenizer: splits the "data" column on single non-word
# characters.
regexTokenizer = (RegexTokenizer()
                  .setInputCol("data")
                  .setOutputCol("words")
                  .setPattern("\\W"))

# Stop words: this list is the complete stop-word set handed to the remover.
add_stopwords = ["http","https","amp","rt","t","c","the"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered",
                                    stopWords=add_stopwords)

# Bag-of-words counts (this stage is not part of the pipeline built below).
countVectors = (CountVectorizer()
                .setInputCol("filtered")
                .setOutputCol("features")
                .setVocabSize(10000)
                .setMinDF(5))

# <b><h3>8. Creating pipeline of stages in the order of execution</h3></b>

# In[36]:

# Hashed term frequencies followed by IDF weighting.
hashingTF = (HashingTF()
             .setInputCol("filtered")
             .setOutputCol("rawFeatures")
             .setNumFeatures(10000))
# minDocFreq: remove sparse terms
idf = (IDF()
       .setInputCol("rawFeatures")
       .setOutputCol("features")
       .setMinDocFreq(5))
pipeline = Pipeline(stages=[
    regexTokenizer,
    stopwordsRemover,
    hashingTF,
    idf,
    label_stringIdx,
])
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.shell import spark

# Three sample sentences; the last one is separated by commas only.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    ["id", "sentence"],
)

# Plain tokenizer versus a regex tokenizer that splits on non-word characters.
tokenizer = Tokenizer().setInputCol("sentence").setOutputCol("words")
regexTokenizer = (RegexTokenizer()
                  .setInputCol("sentence")
                  .setOutputCol("words")
                  .setPattern("\\W"))
# alternatively, pattern="\\w+", gaps(False)

# UDF returning the number of tokens in an array column.
countTokens = udf(lambda token_list: len(token_list), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
regexTokenized = regexTokenizer.transform(sentenceDataFrame)

# Show sentence, tokens and token count for each tokenizer in turn.
for frame in (tokenized, regexTokenized):
    with_counts = frame.select("sentence", "words").withColumn(
        "tokens", countTokens(col("words")))
    with_counts.show(truncate=False)
.withColumnRenamed('lowerText', 'text')) print parsed.columns, '\n\n' print parsed.select('text').first() # COMMAND ---------- # MAGIC %md # MAGIC Next, let's convert our text into a list of words so that we can perform some analysis at the word level. For this we will use a feature transformer called `RegexTokenizer` which splits up strings into tokens (words in our case) based on a split pattern. We'll split our text on anything that matches one or more non-word characters. # COMMAND ---------- from pyspark.ml.feature import RegexTokenizer tokenizer = (RegexTokenizer() .setInputCol('text') .setOutputCol('words') .setPattern('\\W+')) wordsDF = tokenizer.transform(parsed) # COMMAND ---------- wordsDF.select('words').first() # COMMAND ---------- # MAGIC %md # MAGIC There are some very common words in our list of words which won't be that useful for our later analysis. We'll create a UDF to remove them. # MAGIC # MAGIC [StopWordsRemover](http://spark.apache.org/docs/latest/ml-features.html#stopwordsremover) is implemented for Scala but not yet for Python. We'll use the same [list](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) of stop words it uses to build a user-defined function (UDF). # COMMAND ----------
def train_preproecessing(self, content):
    """Build the training DataFrame of opcode-count features and labels.

    Reads the training file names and their ``.asm`` sources, reduces every
    file to the tokens that appear in ``content``, attaches the label that
    sits on the same line number of ``y_train.txt``, and vectorizes the
    tokens with a freshly fitted ``CountVectorizer``.

    :param content: Collection of tokens to keep (tested with ``in``).
    :return: Tuple ``(traindata, cv)``: the DataFrame with ``words``,
        ``features`` and integer ``label`` columns, and the fitted
        CountVectorizer model.
    """

    def keep_known_tokens(text):
        # Delete tab, CR, LF, '+', '-', '=', '|', ';', '{', 'Z' and '}' in a
        # single pass (same characters the step-by-step clean-up removed),
        # split on whitespace and keep only tokens present in `content`.
        stripped = re.sub(r"[\t\r\n+\-=|;{Z}]+", "", text)
        kept = [token for token in stripped.split() if token in content]
        return " ".join(map(str, kept))

    ## Load files
    X_files = sc.textFile('gs://chatrath/files/X_train.txt')
    asm_paths = X_files.map(
        lambda name: "gs://chatrath/data/asm/" + name + ".asm")
    # wholeTextFiles takes a single comma-separated string of paths.
    asm_paths = asm_paths.reduce(lambda left, right: left + "," + right)
    X_asm = sc.wholeTextFiles(asm_paths)

    ## Filter and Extract opcode
    X_train_asm = X_asm.mapValues(keep_known_tokens)
    # Key each record by the bare file name (no directory, no extension).
    X_train_asm = X_train_asm.map(
        lambda pair: (pair[0].split("/")[-1].split(".")[0], pair[1]))

    ## Create Dataframe
    asm_df = X_train_asm.map(
        lambda pair: Row(filename=pair[0], data=pair[1])).toDF()

    ## Load lables
    rddy = sc.textFile("gs://chatrath/files/y_train.txt")

    ## Ensuring lables are in order
    # File names and labels are matched by their line index.
    X_files = X_files.zipWithIndex()
    rddy = rddy.zipWithIndex()
    dfy = rddy.map(
        lambda entry: Row(label=entry[0], id1=entry[1])).toDF()  #(id,label)
    dfx = X_files.map(
        lambda entry: Row(file=entry[0], id=entry[1])).toDF()  #(id,filename)

    resultantdf = dfx.alias('a').join(
        dfy.alias('b'), col('b.id1') == col('a.id'))
    resultantdf = resultantdf.drop('id', 'id1')  #(filename,label)
    resultantdf = resultantdf.join(
        asm_df, asm_df.filename == resultantdf.file)
    resultantdf = resultantdf.drop('file', 'filename')  #(data,label)

    ## Tokenizing data
    regexToken = (RegexTokenizer()
                  .setInputCol("data")
                  .setOutputCol("words")
                  .setPattern("\\W"))
    asm_df = regexToken.transform(resultantdf).drop('data')

    ## Using CountVectorizer to extract features
    countvector = (CountVectorizer()
                   .setInputCol("words")
                   .setOutputCol("features"))
    cv = countvector.fit(asm_df)
    asm_df = cv.transform(asm_df)

    # Labels were read as text; cast them to integers.
    traindata = asm_df.withColumn('label', resultantdf['label'].cast('int'))
    return traindata, cv
from pyspark.sql.functions import col

# Keep only successfully parsed, non-redirect articles that have text.
parsed = (dfSmall
          .filter(col("title") != "<PARSE ERROR>")
          .filter(col("redirect_title").isNull())
          .filter(col("text").isNotNull()))
parsed.take(1)

# COMMAND ----------

# MAGIC %md
# MAGIC Use a regular expression to tokenize (split into words). Pattern defaults to matching the separator, but can be set to match tokens instead.

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

# Each stage reads the output column of the previous one.
hashingTF = HashingTF(numFeatures=10000,
                      inputCol=tokenizer.getOutputCol(),
                      outputCol="hashingTF")
idf = IDF(minDocFreq=10,
          inputCol=hashingTF.getOutputCol(),
          outputCol="idf")
normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features")