from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover, Tokenizer


def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                 outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(),
                   outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(),
                            outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(),
                    predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole fitted pipeline is saved to a folder

    return model, words_prediction
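# Illustrative only (not part of the original source): a minimal driver for
# fit_kmeans, assuming the imports above are in scope. The toy `products`
# DataFrame and its rows are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("FitKMeansDemo").getOrCreate()
products = spark.createDataFrame(
    [("red running shoes",), ("blue running shoes",), ("stainless steel kettle",)],
    ["title"],
)
model, clustered = fit_kmeans(spark, products)
# The step counter names the prediction column "6_kmeans"
clustered.select("title", "6_kmeans").show()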
def token(dataframe, in_col, out_col):
    tokenizer = Tokenizer(inputCol=in_col, outputCol=out_col)
    dataframe = tokenizer.transform(dataframe)
    dataframe.printSchema()
    return dataframe
def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Process the dataset and build feature vectors.
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)

    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    # Train the decision tree model.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    # Test the model.
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Test on user data: a single news item.
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    # Evaluate the model.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    return render(request, {'resultList': resultList})
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType([StructField("label", IntegerType(), True),
                         StructField("title", StringType(), True),
                         StructField("abstract", StringType(), True)])

    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                     'train.csv')), header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')), header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'],
                                                            args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'],
                                                            args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
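# A hedged usage sketch for run_tf_idf_spark_ml (not from the original source);
# assumes a SparkSession named `spark` and a DataFrame with a string "body" column.
posts = spark.createDataFrame(
    [("spark makes distributed computing approachable",),
     ("tf idf gives rare words more weight",)],
    ["body"],
)
run_tf_idf_spark_ml(posts).select("body", "features").show(truncate=False)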
def predictLabel(label, title, model):
    """Predict the label of a news item."""
    # Assumes `sqlContext` and a fitted `idfModel` are available in the enclosing scope.
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    # Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2])))
    # Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    # Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    # Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
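# Hypothetical call to create_features (a sketch, not in the original); assumes
# the surrounding script provides `sc` and `sqlContext`, and feeds an RDD of
# (appid, price, sentence) tuples.
raw = sc.parallelize([
    (1, 0.99, "fun puzzle game"),
    (2, 4.99, "productivity todo app"),
])
create_features(raw).select("appid", "price", "features").show(truncate=False)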
def preprocessing_titles(path, name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    # after stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData = remover.transform(wordsData)
    df = wordsData.map(lambda x: x['id']).zipWithUniqueId().toDF(["id", "index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    qr = sqlContext.sql("SELECT index, words.id, filtered FROM indices JOIN words ON words.id = indices.id")
    if name != '':
        exportOnS3(qr, "s3a://redit-preprocessed/", name)
    qr = qr.map(lambda row: (row['index'], row['id'], row['filtered']))
    return qr
def _build_stages(self):
    self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
    self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
    self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
    self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
    self.lr = LogisticRegression(maxIter=10, regParam=0.01)
    return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]
def getPipeline(self, df):
    # notify pipeline
    self.success('Initializing ML Pipeline ...')

    # initialize our tokenizer; we're going to tokenize features
    tokenizer = Tokenizer(inputCol='tag_features', outputCol='words')
    # convert the tokenized data to vectorized data
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')
    # initialize the logistic regression algorithm
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    # create / initialize the ml pipeline
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # fit the pipeline on our training dataframe
    model = pipeline.fit(df)
    return model
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() &
                                  (dataset['useragent_locale'].isNull() |
                                   (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))) \
                             .reduceByKey(lambda x, y: x + y) \
                             .toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])) \
                             .toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)) \
                                .rdd \
                                .map(lambda p: (p['signature'],
                                                sorted(zip(p['tfidf_features'].indices,
                                                           p['tfidf_features'].values),
                                                       key=lambda i: i[1], reverse=True)[:10])) \
                                .collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests])
                 for signature, bests in bests_per_doc])
def main():
    '''
    Takes one input argument: the location of the directory with the training and test data files.
    :return: prints the area under the ROC curve to the console.
    '''
    conf = SparkConf().setAppName("MLPipeline")
    sc = SparkContext(conf=conf)

    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet("20news_train.parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training data.
    model = pipeline.fit(trainDF)

    numFeatures = (1000, 5000, 10000)
    regParam = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, numFeatures) \
                                  .addGrid(lr.regParam, regParam) \
                                  .build()

    cv = CrossValidator().setEstimator(pipeline) \
                         .setEvaluator(BinaryClassificationEvaluator()) \
                         .setEstimatorParamMaps(paramGrid) \
                         .setNumFolds(2)

    # Evaluate the model on testing data
    testDF = sqlCt.read.parquet("20news_test.parquet")
    prediction = model.transform(testDF)
    evaluator = BinaryClassificationEvaluator()

    model_cv = cv.fit(trainDF)
    prediction_cv = model_cv.transform(testDF)
    print(evaluator.evaluate(prediction))
    print(evaluator.evaluate(prediction_cv))
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)

    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
def main():
    # Read training data as a DataFrame
    sqlCt = SQLContext(sc)
    trainDF = sqlCt.read.parquet(training_input)
    testDF = sqlCt.read.parquet(testing_input)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    evaluator = BinaryClassificationEvaluator()

    # no parameter tuning
    hashingTF_notuning = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
    lr_notuning = LogisticRegression(maxIter=20, regParam=0.1)
    pipeline_notuning = Pipeline(stages=[tokenizer, hashingTF_notuning, lr_notuning])
    model_notuning = pipeline_notuning.fit(trainDF)
    prediction_notuning = model_notuning.transform(testDF)
    notuning_output = evaluator.evaluate(prediction_notuning)

    # for cross validation
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20)

    paramGrid = ParamGridBuilder()\
        .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])\
        .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\
        .build()

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=2)
    cvModel = cv.fit(trainDF)

    # Make predictions on test documents. cvModel uses the best model found.
    best_prediction = cvModel.transform(testDF)
    best_output = evaluator.evaluate(best_prediction)

    s = str(notuning_output) + '\n' + str(best_output)
    output_data = sc.parallelize([s])
    output_data.saveAsTextFile(output)
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
path = './txt_p'
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
filecontent = len(files)

dataset = []
cont = 0
for f in files:
    j = os.path.join(path, f)
    with open(j, 'r') as myfile:
        data = myfile.read().replace('\n', '')
    cont = cont + 1
    dataset.append((cont, f, data))

rdd = sc.parallelize(dataset)
schemaData = rdd.map(lambda x: Row(num=x[0], title=x[1], text=x[2]))
dataFrame = sqlContext.createDataFrame(schemaData)

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(dataFrame)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()

# Normalize and transform the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity computation using the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(), outputCol="features1")
    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(), outputCol="features2")
    vecAssembler = VectorAssembler(inputCols=["features1", "features2"], outputCol="features")
    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])
    paramGrid = (ParamGridBuilder().addGrid(lor.maxIter, [10, 20]).addGrid(
        lor.regParam, [0.1, 0.01]).build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3,
                              estimatorParamMaps=paramGrid,
                              evaluator=eva,
                              numFolds=2)
    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{"name": "Tokenizer_1"}, {"name": "HashingTF_1"}],
            },
            {
                "name": "Pipeline_3",
                "stages": [{"name": "Tokenizer_2"}, {"name": "HashingTF_2"}],
            },
            {
                "name": "CrossValidator",
                "evaluator": {"name": "MulticlassClassificationEvaluator"},
                "tuned_estimator": {
                    "name": "Pipeline_4",
                    "stages": [
                        {"name": "VectorAssembler"},
                        {"name": "OneVsRest", "classifier": {"name": "LogisticRegression"}},
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
df_news = sqlContext.sql("SELECT Date, Top1, Top2, Top25 FROM combined_news_djia_csv")

num_word_features = 2000

# news data only goes to July 2016
df_news = sqlContext.sql("SELECT * FROM combined_news_djia_csv")
df_news = df_news.select(
    "Date",
    concat(col("Top1"), lit(" "), col("Top2"), lit(" "), col("Top3"), lit(" "),
           col("Top4"), lit(" "), col("Top5"), lit(" "), col("Top6"), lit(" "),
           col("Top7"), lit(" "), col("Top8"), lit(" "), col("Top9"), lit(" "),
           col("Top10"), lit(" "), col("Top11"), lit(" "), col("Top12"), lit(" "),
           col("Top13"), lit(" "), col("Top14"), lit(" "), col("Top15"), lit(" "),
           col("Top16"), lit(" "), col("Top17"), lit(" "), col("Top18"), lit(" "),
           col("Top19"), lit(" "), col("Top20"), lit(" "), col("Top21"), lit(" "),
           col("Top22"), lit(" "), col("Top23"), lit(" "), col("Top24"), lit(" "),
           col("Top25")).alias("all_text_dirty"))
df_news = df_news.withColumn("all_text_1", regexp_replace(col("all_text_dirty"), "['\"]", ""))
df_news = df_news.withColumn("all_text", expr("substring(all_text_1, 2, length(all_text_1)+1)"))
df_news = df_news.dropna()

tokenizer = Tokenizer(inputCol="all_text", outputCol="words")
wordsData = tokenizer.transform(df_news)

remover = StopWordsRemover(inputCol="words", outputCol="wordsFil")
wordsDataFil = remover.transform(wordsData)

hashingTF = HashingTF(inputCol="wordsFil", outputCol="rawFeatures", numFeatures=num_word_features)
featurizedData = hashingTF.transform(wordsDataFil)

# alternatively, CountVectorizer can also be used to get term frequency vectors
idf = IDF(inputCol="rawFeatures", outputCol="news_features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#df_news = rescaledData.select("Date", "news_features")
# Import csv for training data
start_data = spark.read.format("csv").option("header", "true").load("data/sepsis.csv")

# DATA CLEANUP

# Remove NULLs
start_data = start_data.na.drop(subset=["CATEGORY", "COMMENT"])

# Filter to ensure that category is pulled in correctly
start_data = start_data.filter(start_data['CATEGORY'].isin('include', 'exclude'))

# BUILD FEATURES

# Create a length column to be used as a future feature
from pyspark.sql.functions import length

data = start_data.withColumn('length', length(start_data['COMMENT']))

# Create all the features to the data set
include_exclude_to_num = StringIndexer(inputCol='CATEGORY', outputCol='label')
tokenizer = Tokenizer(inputCol="COMMENT", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

# Create feature vectors
# See https://spark.apache.org/docs/latest/ml-features.html#vectorassembler
# This just creates a new, single vector of features that is the concatenation
# of the tf-idf data and the length of the comment
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

# DATA PROCESSING PIPELINE

# Create and run a data processing Pipeline
# See https://spark.apache.org/docs/latest/ml-pipeline.html#pipeline
data_prep_pipeline = Pipeline(stages=[include_exclude_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])
# In earlier steps you:
# - removed stop words
# - applied the hashing trick
# - converted the data from counts to IDF and
# - trained a logistic regression model.
# Each of these steps was done independently. This seems like a great application for a pipeline!

# Instructions
# 100 XP
# Create an object for splitting text into tokens.
# Create an object to remove stop words. Rather than explicitly giving the input column name,
# use the getOutputCol() method on the previous object.
# Create objects for applying the hashing trick and transforming the data into a TF-IDF.
# Use the getOutputCol() method again.
# Create a pipeline which wraps all of the above steps as well as an object to create a
# Logistic Regression model.

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
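# A hedged illustration of running the assembled pipeline end to end (not part
# of the exercise); `spark` and the toy `sms` DataFrame are assumptions.
sms = spark.createDataFrame(
    [("win a free prize now now now", 1.0),
     ("see you at lunch tomorrow", 0.0),
     ("free entry claim your prize", 1.0),
     ("meeting moved to thursday", 0.0)],
    ["text", "label"],
)
pipeline_model = pipeline.fit(sms)
pipeline_model.transform(sms).select("text", "label", "prediction").show(truncate=False)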
def sentence_data(df_data):
    # Assumes `removepunctuations` is a registered UDF; alias its output so it
    # matches the tokenizer's input column.
    df_data2 = df_data.select(df_data._id, removepunctuations(df_data.text_entry).alias('textentry'))
    only_words = Tokenizer(inputCol='textentry', outputCol="words")
    df_data3 = only_words.transform(df_data2)
    return df_data3
def trainModel(self):
    logger.info("Training the model...")

    query = '''select page_id, max(page_title) as page_title
               from cooladata
               where date_range(all) and page_id is not null
               group by page_id;'''

    def SQLtoURL(query):
        data = query.replace('\n', ' ').replace('\t', ' ').replace('   ', ' ').replace('  ', ' ')
        return data

    def QueryXXXXX(query, file=None):
        session = Session()
        response = session.post(
            data={'tq': query},
            url='https://app.XXXXXX.com/api/v2/projects/115659/cql/',
            headers={'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},
        )
        return response.content

    table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
    title_list = [x['c'] for x in table['rows']]
    table_cols = [d['label'] for d in table['cols']]

    def convert_row(row):
        rowlist = [d['v'] for d in row]
        return rowlist

    rd = self.sc.parallelize(title_list).map(convert_row)
    titleData = self.spark.createDataFrame(rd, table_cols)
    titleData = titleData.dropna()

    hebrew_stopwords = stop_words()

    def rmv(words):
        for punc in punctuation:
            words = words.replace(punc, "")
        for hword in hebrew_stopwords:
            words = words.replace(hword, " ")
        return words

    self.spark.udf.register("rmv", rmv, StringType())
    titleData.registerTempTable("wordstable")
    cleanedSentenceData = self.spark.sql(
        "select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")

    tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
    wordsData = tokenizer.transform(cleanedSentenceData)

    cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF=2.0)
    cvModel = cv.fit(wordsData)
    featurizedData = cvModel.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    lda = LDA(k=100)
    ldaModel = lda.fit(rescaledData)
    postFactorizedData = ldaModel.transform(rescaledData)

    norm = Normalizer(inputCol="topicDistribution", outputCol="normTopicDist")
    scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

    self.model = scaledFactorizedNormalizedData
    logger.info("model is built!")
def main():
    set_pandas_options()

    app_name = "Case Study 2: Email Analytics"
    conf = SparkConf().setAppName(app_name)
    conf = (conf.setMaster('local[*]')
                .set("spark.driver.host", "localhost")
                .set('spark.executor.memory', '4G')
                .set('spark.driver.memory', '8G')
                .set('spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info("pyspark script logger initialized")

    # 1 Load data into Spark DataFrame
    LOG = get_hdfs_filepath('*/*/*')

    # read text file
    log_txt_df = sc.wholeTextFiles(LOG).filter(lambda line: line != '').toDF()

    # Convert strings to columns
    udf1 = udf(to_utc_timestamp, TimestampType())
    df = log_txt_df
    df = df.select(df._2.alias('line'))
    temp = df.select(
        regexp_extract(col('line'), r'Message-ID:\s<.*>', 0).alias('Message_ID'),
        regexp_extract(
            col('line'),
            r'\d{1,2}\s\w{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s(\+|\-)\d{4}(.*)',
            0).alias("Date"),
        regexp_extract(col('line'), r'From:\s(.*)', 0).alias("From"),
        regexp_extract(
            col('line'),
            r"To:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(\S+@\S+)(?:\n|\r\n?)Subject:\s",
            0).alias("To"),
        regexp_extract(
            col('line'),
            r"Subject:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}",
            1).alias("Subject"),
        regexp_extract(
            col('line'),
            r"Cc:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(?:\n|\r\n?)Mime-Version:\s",
            0).alias("Cc"),
        regexp_extract(col('line'), r'Mime-Version:\s(.+)', 1).alias("Mime_Version"),
        regexp_extract(col('line'), r'Content-Type:\s(.*)', 1).alias("Content_Type"),
        regexp_extract(col('line'), r"Content-Transfer-Encoding:\s(.+)", 1).alias("Content_Transfer_Encoding"),
        regexp_extract(col('line'), r"X-From:\s(.*)(?:\n|\r\n?)X-To:\s", 0).alias("X_From"),
        regexp_extract(col('line'), r'X-To:\s(.*)(?:\n|\r\n?)X-cc:\s', 0).alias("X_To"),
        regexp_extract(col('line'), r'X-cc:\s(.*)(?:\n|\r\n?)X-bcc:\s', 0).alias("X_cc"),
        regexp_extract(col('line'), r'X-bcc:\s(.*)(?:\n|\r\n?)X-Folder:\s', 0).alias("X_bcc"),
        regexp_extract(col('line'), r'X-Folder:\s(.*)(?:\n|\r\n?)X-Origin:\s', 0).alias("X_Folder"),
        regexp_extract(col('line'), r"X-Origin:\s(.*)(?:\n|\r\n?)X-FileName:\s", 0).alias("X_Origin"),
        regexp_extract(col('line'), r"X-FileName:\s(.*)", 0).alias("X_FileName"),
        regexp_extract(
            col('line'),
            r"X-FileName:\s(.*)((?:\n|\r\n?){1,}(.*)){1,}((?:(?:\n|\r\n?).+)+)",
            0).alias("FYI"))
    #temp.cache()

    temp1 = temp.select(
        expr("substring(Message_ID, 14, length(Message_ID)-14)").alias("Message_ID"),
        'Date',
        udf1('Date').alias('UTC_timestamp'),
        expr("substring(From, 7, length(From)-6)").alias("From"),
        expr("substring(To, 5, length(To)-15)").alias("To"),
        "Subject",
        expr("substring(Cc, 5, length(Cc)-20)").alias("Cc"),
        "Mime_Version",
        "Content_Type",
        'Content_Transfer_Encoding',
        expr("substring(X_From, 9, length(X_From)-16)").alias("X_From"),
        expr("substring(X_To, 7, length(X_To)-14)").alias("X_To"),
        expr("substring(X_cc, 7, length(X_cc)-15)").alias("X_cc"),
        expr("substring(X_bcc, 8, length(X_bcc)-19)").alias("X_bcc"),
        expr("substring(X_Folder, 11, length(X_Folder)-22)").alias("X_Folder"),
        expr("substring(X_Origin, 11, length(X_Origin)-24)").alias("X_Origin"),
        expr("substring(X_FileName, 13, length(X_FileName)-15)").alias("X_FileName"),
        regexp_replace(
            col('FYI'),
            r"(X-FileName:\s(.*)(?:\n|\r\n?){1,})|(-*Original Message-*(.*)((?:\n|\r\n?){1,}(.*)){0,}((?:(?:\n|\r\n?).+)+))",
            '').alias('FYI'))
    #temp1.cache()

    result = temp1.select(
        "Message_ID",
        'Date',
        'UTC_timestamp',
        "From",
        regexp_replace(col('To'), r"\r\n\t", "").alias("To"),
        "Subject",
        regexp_replace(col('Cc'), r"\r\n\t", "").alias("Cc"),
        "Mime_Version",
        "Content_Type",
        'Content_Transfer_Encoding',
        "X_From", "X_To", "X_cc", "X_bcc", "X_Folder", "X_Origin", "X_FileName",
        regexp_replace(col('FYI'), r"(^\s{1,})|(\n{2,})", '').alias('FYI'))

    zz = result.limit(5).toPandas()
    LOGGER.info("\n\n1.\tLoad data into Spark DataFrame\tDone!\n\n{}\n".format(zz))

    # 2 Display the top 10 high-frequency users based on weekly numbers of emails sent
    df1 = result
    freq = df1.groupBy('From').agg(
        (count('UTC_timestamp') /
         ((max(unix_timestamp(col('UTC_timestamp'))) -
           min(unix_timestamp(col('UTC_timestamp')))) / 604800)).alias('rate_per_week')
    ).orderBy("rate_per_week", ascending=False)
    zz = freq.limit(10).toPandas()
    LOGGER.info(
        "\n\n2.\tDisplay the top 10 high-frequency users based on weekly numbers of emails sent\tDone!\n\n{}\n"
        .format(zz))

    # 3a Extract top 20 keywords from the subject text for the top 10 high-frequency users
    top = freq.limit(10)
    top_subj = df1.join(top, df1["From"] == top["From"], "inner").select(df1['From'], df1['Subject'])
    top_texts = top_subj.groupBy("From").agg(concat_ws(" ", collect_list("Subject")).alias("texts"))
    top_texts = top_texts.select('texts').agg(concat_ws(" ", collect_list("texts")).alias("subjects"))

    # Extract words
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(top_texts)

    # Remove stopwords, extending the stop words dictionary with custom stop words
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "", "fw"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)

    # Extract the top 20 keywords after removing the common stop words
    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3a.\tExtract top 20 keywords from the subject text for the top 10 high-frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 3b Extract top 20 keywords from the subject text for the non-high frequency users
    w = Window().orderBy(lit('A'))
    bottom = freq.orderBy("rate_per_week", ascending=False).withColumn("row_num", row_number().over(w))
    bottom = bottom.where(col('row_num') > 10).select('From', 'rate_per_week')
    bottom_subj = df1.join(bottom, df1["From"] == bottom["From"], "inner").select(df1["From"], df1["Subject"])
    bottom_texts = bottom_subj.groupBy("From").agg(concat_ws(" ", collect_list("Subject")).alias("texts"))
    bottom_texts = bottom_texts.select('texts').agg(concat_ws(" ", collect_list("texts")).alias("subjects"))

    # Extract words
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(bottom_texts)

    # Remove stopwords (custom stopwords)
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)

    # Generate features
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3b.\tExtract top 20 keywords from the subject text for the non-high frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 6 Introduce a new column label to identify new, replied, and forwarded messages
    df = result

    def to_label(sbj):
        l1 = "RE" if sbj.startswith("RE:") else ("FW" if sbj.startswith("FW:") else 'NEW')
        return l1

    udf2 = udf(to_label, StringType())
    df_with_label = df.withColumn('label', udf2("Subject"))
    zz = df_with_label.limit(5).toPandas()
    LOGGER.info(
        "\n\n6.\tIntroduce a new column label to identify new, replied, and forwarded messages\tDone!\n\n{}\n"
        .format(zz))

    # 7 Get the trend of the overall mail activity using a pivot table from Spark itself
    pivotDF = df_with_label.groupBy(
        year("UTC_timestamp").alias('year'),
        month("UTC_timestamp").alias('month')).pivot("label").count().orderBy("year", "month")
    zz = pivotDF.na.fill(0).toPandas()
    LOGGER.info(
        "\n\n7.\tGet the trend of the overall mail activity using a pivot table from Spark itself\tDone!\n\n{}\n"
        .format(zz))

    # 8 Use k-means clustering to create 4 clusters from the extracted keywords
    raw = result.select("Message_ID", "From", "Subject")

    # Extract words
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("Subject").setOutputCol("words")
    transformed = tokenizer.transform(raw)

    # Remove stopwords (custom stopwords)
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    cleaned = cleaned.select("Message_ID", "words", "filtered")

    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
    featured = cvmodel.transform(cleaned)

    kmeans = KMeans(k=4, seed=1)  # 4 clusters here
    model = kmeans.fit(featured.select('features'))
    transformed = model.transform(featured)
    zz = transformed.limit(5).toPandas()
    LOGGER.info(
        "\n\n8.\tUse k-means clustering to create 4 clusters from the extracted keywords\tDone!\n\n{}\n"
        .format(zz))

    # 9 Use LDA to generate 4 topics from the extracted keywords
    LOGGER.info(
        "\n\n9.\tUse LDA to generate 4 topics from the extracted keywords\tDone!\n\n{}\n{}\n{}\n{}\n"
        .format(get_topic(0, transformed), get_topic(1, transformed),
                get_topic(2, transformed), get_topic(3, transformed)))
#print "loading 20 newsgroups dataset..." tic = time() dataset = fetch_20newsgroups(shuffle=True, random_state=0, remove=('headers','footers','quotes')) train_corpus = dataset.data # a list of 11314 documents / entries toc = time() print ("elapsed time: %.4f sec" %(toc - tic) ) #distribute data corpus_rdd = sc.parallelize(train_corpus) corpus_rdd = corpus_rdd.map(lambda doc: re.sub(r"[^A-Za-z]", " ", doc)) corpus_rdd = corpus_rdd.map(lambda doc: u"".join(doc).encode('utf-8').strip()) rdd_row = corpus_rdd.map(lambda doc: Row(raw_corpus=str(doc))) newsgroups = spark.createDataFrame(rdd_row) tokenizer = Tokenizer(inputCol="raw_corpus", outputCol="tokens") newsgroups = tokenizer.transform(newsgroups) newsgroups = newsgroups.drop('raw_corpus') stopwords = StopWordsRemover(inputCol="tokens", outputCol="tokens_filtered") newsgroups = stopwords.transform(newsgroups) newsgroups = newsgroups.drop('tokens') count_vec = CountVectorizer(inputCol="tokens_filtered", outputCol="tf_features", vocabSize=num_features, minDF=2.0) count_vec_model = count_vec.fit(newsgroups) vocab = count_vec_model.vocabulary newsgroups = count_vec_model.transform(newsgroups) newsgroups = newsgroups.drop('tokens_filtered') #hashingTF = HashingTF(inputCol="tokens_filtered", outputCol="tf_features", numFeatures=num_features) #newsgroups = hashingTF.transform(newsgroups)
from pyspark.ml.feature import StringIndexer, Tokenizer, CountVectorizer, IDF
from pyspark.ml import Pipeline
import numpy as np

# Remove rows with null values
reviewText_data = csv_data.select('reviewText')
reviewText_data = reviewText_data.na.drop()

# Register a 'function' to clean text
cleantext = spark.udf.register("cleantext", clean_text)

# Cleaned reviewText data
clean_reviewText_data = reviewText_data.select(cleantext("reviewText").alias("reviewText"))

# Convert sentences into discrete words
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")

# Calculate term frequency for each word
tf = CountVectorizer(inputCol="words", outputCol="tf", vocabSize=2**6, minDF=0.05, minTF=1)

# Calculate IDF given the term frequency
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=1)  # minDocFreq: remove sparse terms

# Fit the cleaned reviewText data through the pipeline
pipeline = Pipeline(stages=[tokenizer, tf, idf])
pipelineFit = pipeline.fit(clean_reviewText_data)
train_df = pipelineFit.transform(clean_reviewText_data)

# Save TF-IDF as text files in datanodes
train_df.rdd.saveAsTextFile("hdfs://ec2-34-239-131-131.compute-1.amazonaws.com:9000/output15/")
def lower_text(line):
    word_list = re.findall(r'[\w_]+', line.lower())
    return ' '.join(map(str, word_list))

filter_data_withColumn = filter_data.withColumn(
    "text_lower", udf(lower_text, StringType())("Text")).select('text_lower', 'Score')

# Showing the result
filter_data_withColumn.show(15)

# # Tokenize

# In[11]:

tokenize = Tokenizer(inputCol="text_lower", outputCol="words")
words_Data_Frame = tokenize.transform(filter_data_withColumn)
words_Data_Frame.take(5)

# # Remove Stopword

# In[12]:

remove = StopWordsRemover(inputCol="words", outputCol="filtered_words")
words_Data_Frame1 = remove.transform(words_Data_Frame).select("filtered_words", "Score")
words_Data_Frame1.show(5)

# # Stemming
spark = SQLContext(sc)

# LOADING DATA FROM HDFS TO SPARK DATAFRAME
df0 = spark.read.option("sep", "\t").option('header', True).csv(
    'hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv')
df0.printSchema()

# FILTERING FOR EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull()) & (col("verified_purchase").isNotNull()))

# ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

# CREATING TF-IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# NAIVE BAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

# Model training
model = nb.fit(rescaledData)

# Model saving
model.write().overwrite().save("./NB_model")
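# A small follow-up sketch (an assumption, not in the original script): the
# saved model can be reloaded later and applied to freshly featurized data.
from pyspark.ml.classification import NaiveBayesModel

reloaded = NaiveBayesModel.load("./NB_model")
reloaded.transform(rescaledData).select("class_res", "prediction").show(5)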
trainingCount = parts.count()
f = indexedTweets.map(lambda p: Row(tindex=int(p[1]), tweet=p[0][0],
                                    label=int(float(p[0][1])), training=1))
#f = parts.map(lambda p: Row(tweet=p[0], label=int(p[1])))

linest = sc.textFile("/home/ankita/MLProject/SVM/GroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
indexedTweetst = partst.zipWithIndex().map(lambda ab: (ab[0], ab[1] + trainingCount))
ft = indexedTweetst.map(lambda p: Row(tindex=int(p[1]), tweet=p[0][1],
                                      label=int(float(p[0][0])), training=0))

alldata = f.union(ft)
schemaTweets = sqlContext.createDataFrame(alldata)
schemaTweets.registerTempTable("data")

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(schemaTweets)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#rescaledData.collect()

wordsvectors = rescaledData.filter(rescaledData.training == 1)["label", "features"] \
                           .map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
    u'more', u'most', u'must', u'my', u'myself', u'no', u'nor', u'not', u'now',
    u'o', u'of', u'off', u'on', u'once', u'only', u'or', u'other', u'our',
    u'ours', u'ourselves', u'out', u'over', u'own', u'r', u're', u's', 'said',
    u'same', u'she', u'should', u'shouldnt', u'so', u'some', u'such', u't',
    u'than', u'that', 'thats', u'the', u'their', u'theirs', u'them',
    u'themselves', u'then', u'there', u'these', u'they', u'this', u'those',
    u'through', u'to', u'too', u'under', u'until', u'up', u'very', u'was',
    u'wasnt', u'we', u'were', u'werent', u'what', u'when', u'where', u'which',
    u'while', u'who', u'whom', u'why', u'will', u'with', u'wont', u'would',
    u'y', u'you', u'your', u'yours', u'yourself', u'yourselves'
]

stopwordsRemover = StopWordsRemover(inputCol="words1", outputCol="filtered").setStopWords(add_stopwords)

tokenizer = Tokenizer(inputCol="Text", outputCol="tokens")
hashtf = HashingTF(numFeatures=2**16, inputCol="filtered", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms

# bag of words count
#hashtf = HashingTF(numFeatures=2**16, inputCol="tokens", outputCol='tf')
#idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, tokenizer, hashtf, idf])
df_review = df_review.filter("cool >= 3 or useful >= 3 or funny >= 3")
df_review = df_review.select("stars", "text")
df_review = df_review.repartition(100)

from pyspark.sql import functions as F
df_review = df_review.withColumn("target", F.when(df_review.stars <= 2, 1).otherwise(0))
df_review.cache()

(train_set, test_set) = df_review.randomSplit([0.7, 0.3], seed=1002)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
#lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(test_set)

lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
#predictions = predictions.select('target', 'label', 'rawPrediction', 'probability', 'prediction')
except KafkaError as ke:
    logger.debug('Fail to start kafka producer, caused by %s' % ke.message)

try:
    # Create dstream from kafka topic
    directKafkaStream = KafkaUtils.createDirectStream(ssc, kafka_topic,
                                                      {'metadata.broker.list': broker_ip})
    logger.debug('Create direct dstream from kafka successfully')
except:
    logger.debug('Unable to create dstream from kafka')

atexit.register(shutdown_hook, kafka_producer, spark)

# Load in the tokenizer, hashing_tf, idf_model, nb_model and the tag/catId maps
try:
    logger.debug('Loading models')
    tokenizer = Tokenizer.load(tokenizer_file)
    hashing_tf = HashingTF.load(hashing_tf_file)
    idf_model = IDFModel.load(idf_model_file)
    nb_model = NaiveBayesModel.load(nb_model_file)
    selected_tags = pd.read_csv(selected_tags_file, header=None)
    local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
    local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))
    catId_to_tags = sc.broadcast(local_catId_to_tags)
    tags_to_catId = sc.broadcast(local_tags_to_catId)
    tags_to_catId_transform = udf(lambda tag: float(tags_to_catId.value[tag]), FloatType())
    catId_to_tags_transform = udf(lambda catId: catId_to_tags.value[catId], StringType())
    logger.debug('loaded models successfully')
except:
    logger.debug('Fail to load models')
def transform(spark, s3_input_data, s3_output_train_data, s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}, {}, {}'.format(s3_input_data, s3_output_train_data,
                                               s3_output_validation_data, s3_output_test_data))

    schema = StructType([
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data, sep='\t', schema=schema, header=True, quote=None)
    df_csv.show()

    # This dataset should already be clean, but it's always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass over the data, applying IDF needs two passes:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  # , minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df) \
                                                       .select('star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df
                            .withColumn('f', to_array(col('scaled_pca_features')))
                            .select(['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit([0.9, 0.05, 0.05])

    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_train_data))

    validation_df.write.csv(path=s3_output_validation_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_validation_data))

    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)
    print('Wrote to output file: {}'.format(s3_output_test_data))
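# Hypothetical invocation of transform() (the paths below are placeholders, not from the source):
# transform(spark, 's3://my-bucket/raw.tsv', 's3://my-bucket/train',
#           's3://my-bucket/validation', 's3://my-bucket/test')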
def tokenize(inputDF):
    tokenizer = Tokenizer(inputCol='sentences', outputCol='tokenizedwords')
    tokenized = tokenizer.transform(inputDF)
    return tokenized
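# Illustrative usage of tokenize (not from the original source); assumes a
# SparkSession named `spark`.
demo_df = spark.createDataFrame([("the quick brown fox",)], ["sentences"])
tokenize(demo_df).show(truncate=False)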
train_datafile = get_args().input
train_df = spark.read.csv(train_datafile, header=True, sep='\t').limit(80000)  # limit the records to a smaller set while debugging

train_sents1 = train_df.select('genre', 'sentence1')
train_sents2 = train_df.select('genre', 'sentence2')
# train_sents1.show(5)

udf_lower = F.udf(lower_folding, StringType())
train_sents1_lower = train_sents1.withColumn('lower_sents', udf_lower('sentence1'))
# train_sents1_lower.show(5)

udf_rv_punc = F.udf(remove_punctuation_re, StringType())
train_sents1_rv_punc = train_sents1_lower.withColumn('rv_punc_sents', udf_rv_punc('lower_sents'))

tokenizer = Tokenizer(inputCol="rv_punc_sents", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="filtered_tokens", outputCol="avg_word_embed")

doc2vec_pipeline = Pipeline(stages=[tokenizer, remover, w2v])
doc2vec_model = doc2vec_pipeline.fit(train_sents1_rv_punc)
doc2vecs_df = doc2vec_model.transform(train_sents1_rv_punc)

w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

genre2label = StringIndexer(inputCol="genre", outputCol="label")
rf_classifier = MultilayerPerceptronClassifier(labelCol="label", featuresCol="avg_word_embed")
lines = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataSet.txt")
parts = lines.map(lambda l: l.split(","))
f = parts.map(lambda p: Row(tindex=int(p[0]), packageName=p[1], packagePermissions=p[2],
                            label=int(float(p[3])), training=1))

linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]), packageName=p[1], packagePermissions=p[2],
                              label=int(float(p[3])), training=0))

alldata = f.union(ft)
schemaApp = sqlContext.createDataFrame(alldata)
schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)

hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

wordsvectors = rescaledData["label", "features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
# Count the number of words in each text
from pyspark.sql.functions import length
data = data.withColumn('length', length(data['text']))
data.show()

# In[3]:

# Compare the length difference between ham and spam
data.groupby('class').mean().show()

# In[4]:

# Create TF-IDF features for each text
# TF: Term Frequency
# IDF: Inverse Document Frequency
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler

tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
final_feature = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, final_feature])

clean_data = data_prep_pipe.fit(data).transform(data)
clean_data.show()
clean_data.take(1)
clean_data.take(1)[0][-1]
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TfIdfExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    for features_label in rescaledData.select("features", "label").take(3):
        print(features_label)
    # $example off$

    spark.stop()
def create_w2v_model():
    spark = SparkSession \
        .builder \
        .appName("SimpleApplication") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.memory.offHeap.enabled", True) \
        .config("spark.memory.offHeap.size", "2g") \
        .getOrCreate()

    input_file = spark.sparkContext.wholeTextFiles(PATH)

    print("""
    Preparing the data (1)...
    """)
    prepared_data = input_file.map(lambda x: (x[0], remove_punctuation(x[1])))

    print("""
    Preparing the data (2)...
    """)
    df = prepared_data.toDF()

    print("""
    Preparing the data (3)...
    """)
    prepared_df = df.selectExpr('_2 as text')

    print("""
    Tokenizing...
    """)
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    print("""
    Removing stop words...
    """)
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)
    filtered = remover.transform(words)

    print("""
    Building the model...
    """)
    word2Vec = Word2Vec(vectorSize=50, inputCol='filtered', outputCol='result', minCount=2)
    model = word2Vec.fit(filtered)

    print("""
    Saving the model...
    """)
    today = datetime.datetime.today()
    model_name = today.strftime("model/kurs_model")
    print("""
    Model """ + model_name + """ saved
    """)
    model.save(model_name)

    spark.stop()
# COMMAND ----------

summary = model.summary
print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
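# A plausible next cell (an assumption, not the author's verbatim code): fit LDA
# on the count-vectorized `prepped` frame and inspect the discovered topics.
lda = LDA().setK(10).setMaxIter(5)
ldaModel = lda.fit(prepped)
ldaModel.describeTopics(3).show()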
# Assumes `word_tokenize` (NLTK), `ps` (a stemmer), `stop_words`, `review`,
# `calification`, `final`, and `filtered_sentence` are defined earlier in the script.
for sentence in review:
    word_tokens = word_tokenize(sentence)
    for w in word_tokens:
        if w not in stop_words:
            w = ps.stem(w)
            final = final + " " + w
    filtered_sentence.append(final)
    final = ""

review = filtered_sentence
# print("\n \n -----: \n " + str(review))

# Run the bag-of-words algorithm
dup_vector = list(zip(calification, review))
sentenceData = spark.createDataFrame(dup_vector, ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# numFeatures should be at least the number of distinct words in the documents;
# it would be a good idea to do a word count here to choose it
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
# rescaledData.select("label", "features").show(20, False)  # to show the dataframe structure

# print(len(review))        # printing the size of both arrays to check index alignment
# print(len(calification))
# print(review)             # just to test what the array contains
import argparse

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import Tokenizer


def filter_comments(df):
    return df.filter(df['author'] != '[deleted]') \
             .filter(df['body'] != '[deleted]') \
             .filter(df['body'] != '[removed]')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Reddit Comment Prediction')
    parser.add_argument('-i', '--input_file', type=str,
                        help="""The JSON input data file that contains the raw comment data""")
    args = parser.parse_args()

    sc = SparkContext("local", "Prediction")
    sqlContext = SQLContext(sc)

    df = sqlContext.read.json(args.input_file)
    print('Loaded input file {} with {} total comments'.format(args.input_file, df.count()))

    filtered = filter_comments(df)
    print('{} comments after filtering'.format(filtered.count()))

    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(filtered)
    wordsDataFrame.select("body", "words").show()
def clustering(self, columns='*', num_cluster=2, n_g=2):
    """
    input:
    @ columns: the column to cluster (special characters removed)
    @ num_cluster: number of clusters
    @ n_g: size of the n-grams
    return:
    @ a data frame with a clustering column
    """
    n_gr = n_g
    data_frame_1 = self._df

    # check the data type; if the column is not a string, cast it to string
    valid_cols = [col for (col, typ) in filter(lambda typ: typ[1] == 'string', self._df.dtypes)]
    if columns not in valid_cols:
        data_frame_1 = data_frame_1.withColumn(columns + '_', data_frame_1[columns].cast("string"))
        data_frame_1 = data_frame_1.drop(columns)
        data_frame_1 = data_frame_1.withColumnRenamed(columns + '_', columns)

    # make the string tokenizable by separating its characters with blank spaces
    udf_space = udf(lambda z: " ".join(z))
    data_frame_1 = data_frame_1.withColumn(columns + '_split', udf_space(columns)).orderBy(columns)

    # tokenize the words and build the n-grams
    tokenizer = Tokenizer(inputCol=columns + '_split', outputCol=columns + "_token")
    data_frame_2 = tokenizer.transform(data_frame_1)

    ngram = NGram(n=n_gr, inputCol=columns + "_token", outputCol=columns + "_ngram")
    ngramDataFrame = ngram.transform(data_frame_2)

    # vectorization: map text to vectors
    cv = CountVectorizer(inputCol=columns + "_ngram", outputCol="features", vocabSize=10, minDF=1.0)
    model = cv.fit(ngramDataFrame)
    result = model.transform(ngramDataFrame)

    # set up and fit k-means
    kmeans = KMeans().setK(num_cluster).setSeed(1)
    model_kmean = kmeans.fit(result)
    predictions_kmean = model_kmean.transform(result)
    df = predictions_kmean.orderBy('prediction', ascending=True).select(self._df.schema.names + ['prediction'])

    # reshape the table so it is easier to read
    print('show the count of each value per cluster')
    temp = df.groupBy(columns, 'prediction').count()
    temp = temp.withColumnRenamed('prediction', 'cluster')
    df = df.withColumnRenamed('prediction', 'cluster')
    temp = temp.withColumnRenamed('count', 'count in cluster')
    temp.show()

    # show the mode (most frequent value) of each cluster
    window = Window.partitionBy("cluster").orderBy(col("count in cluster").desc())
    test = (temp.withColumn('row_num', F.row_number().over(window))
                .where(F.col('row_num') == 1)
                .select(columns, 'cluster'))
    print('Default replacement: replace each instance with the mode of its cluster')
    test.orderBy('cluster', ascending=True).show()

    # collect the mode of each cluster into a list
    test_list = test.select(columns).orderBy('cluster').collect()
    name_list = [i[columns] for i in test_list]

    # let the user define the replacement words
    list_setting = input("Type 'yes' to enter customized replacement words, or press any key for the default replacement setting: \n")
    if list_setting == 'yes':
        count = 0
        while count < num_cluster:
            usr_replace = input('Enter what cluster {0} should be, or press enter to skip: \n'.format(count))
            if usr_replace != '':
                name_list[count] = usr_replace
            count += 1

    # replace the words
    udf_place_name = udf(lambda z: name_list[z])
    data_frame_replace = df.withColumn('replace_' + columns, udf_place_name('cluster'))

    replace_input = input('Type yes to replace the origin column, or press any key to keep it:\n')
    if replace_input == 'yes':
        data_frame_replace = data_frame_replace.drop(columns)
        data_frame_replace = data_frame_replace.withColumnRenamed("replace_" + columns, columns)

    data_frame_replace = data_frame_replace.drop('cluster')
    # replace the origin dataframe and show the result to the user
    self._df = data_frame_replace
    self._df.show()
    return self
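A hypothetical usage sketch for the method above; the wrapper class name and the 'city' column are assumptions, not from the source:

op = DataFrameTransformer(df)  # hypothetical wrapper exposing self._df
op.clustering(columns='city', num_cluster=3, n_g=2)  # interactive prompts drive the replacement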
def cleanLower(doc):
    return doc.replace("<br /><br />", " ").lower()

rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#       .baseOn([evaluator.metricName,'precision'])
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

conf = SparkConf().setAppName("MLPipeline")
sc = SparkContext(conf=conf)

# Read training data as a DataFrame
sqlCt = SQLContext(sc)
trainDF = sqlCt.read.parquet("20news_train.parquet")

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features", numFeatures=1000)
lr = LogisticRegression(maxIter=20, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training data.
model = pipeline.fit(trainDF)

# Build the cross-validation model
paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
             .addGrid(lr.regParam, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
             .build())
crossval = CrossValidator(estimator=pipeline,
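                          # --- assumed completion of the truncated call above;
                          # --- the evaluator choice and fold count are not in the source
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3)
cvModel = crossval.fit(trainDF)  # retrains over the grid and keeps the best model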
df2 = df.withColumn('date', F.unix_timestamp('date', form).cast('timestamp'))
df2.show(5)  # show() prints and returns None, so wrapping it in print() is redundant
df = df2

import matplotlib.pyplot as plt

dates = df.select(F.date_format('date', 'yyyy-MM-dd').alias('no_timestamp')) \
          .groupby('no_timestamp').count().sort(F.col('no_timestamp'))
dates.show(dates.count())
dates.toPandas().plot(kind='line', x='no_timestamp', y='count')
dates.toPandas().plot(kind='bar', x='no_timestamp')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
prep_df = tokenizer.transform(df)

cv_prep = CountVectorizer(inputCol="words", outputCol="prep")
cv_model = cv_prep.fit(prep_df)
ready_df = cv_model.transform(prep_df)

# stopWords = [word for word in cv_prep.vocabulary if any(char.isdigit() for char in word)]
# remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)
# prep_df = remover.transform(prep_df)

# map each row to (id, mllib vector); the original two-argument lambda would fail,
# since map passes a single Row per element
trainable = ready_df.select('tweet_id', 'prep').rdd.map(lambda row: [row[0], Vectors.fromML(row[1])]).cache()

print("Trainable")
print(trainable.take(10))
print("take")

model = LDA.train(trainable, k=5, seed=1, optimizer="online")

exit(0)
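A sketch of inspecting the trained topics, mapping term indices back to words through the CountVectorizer vocabulary fitted above:

topics = model.describeTopics(maxTermsPerTopic=10)
for topic_id, (term_indices, weights) in enumerate(topics):
    terms = [cv_model.vocabulary[i] for i in term_indices]
    print(topic_id, list(zip(terms, weights)))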
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sparkSession = SparkSession\
        .builder\
        .getOrCreate()

    # Prepare training documents from a list of (id, text, label) tuples.
    training = sparkSession.createDataFrame([(0, "a b c d e spark", 1.0),
                                             (1, "b d", 0.0),
                                             (2, "spark f g h", 1.0),
                                             (3, "hadoop mapreduce", 0.0)],
                                            ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and logistic regression.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tk = tokenizer.transform(training)
    tk.printSchema()
    tk.show()

    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    hs = hashingTF.transform(tk)
    hs.printSchema()
    hs.show()

    logistic_regression = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, logistic_regression])
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.types import ArrayType, StringType

lemmtizer = WordNetLemmatizer()  # assumed: the source fragment uses lemmtizer without showing its definition

def lemmetize(input_list):
    if not input_list:
        return list()
    return [lemmtizer.lemmatize(word) for word in input_list]

spark = SparkSession.builder.appName("TfIdf-Lemmetization").getOrCreate()

lemmetize = F.udf(lemmetize)
# spark.udf.register("lemmetize", lemmetize)

documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id", F.row_number().over(Window.orderBy('value')))
documents.printSchema()

# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)
wordsData.show()

# note: despite the column name "lemms", a Snowball stemmer is applied here, not the lemmatizer above
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
wordsData = wordsData.withColumn("lemms", stemmer_udf("words"))

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="lemms", outputCol="result")
model = word2Vec.fit(wordsData)
result = model.transform(wordsData)
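A short usage sketch for the fitted Word2Vec model above (the query word is an assumption):

result.select("lemms", "result").show(truncate=False)
model.findSynonyms("data", 5).show()  # rows of (word, cosine similarity) nearest the assumed query word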
dataSet = dataSet.withColumn('cleanReview', cleanText(F.col('reviews'))).filter(F.col('cleanReview') != '')
dataSet.show()

# %%
dataSet = dataSet.withColumn('class', dataSet['class'].cast(IntegerType()))
dataSet = dataSet.select('class', 'cleanReview').withColumnRenamed('cleanReview', 'reviews')

# %%
trainDF, testDF = dataSet.randomSplit([0.8, 0.2])
trainDF.show()
testDF.show()

# %%
tokenizer = Tokenizer(inputCol="reviews", outputCol="tokens")
countVector = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='features')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, countVector, idf])
pipelineModel = pipeline.fit(trainDF)

# %%
pTrainDF = pipelineModel.transform(trainDF)
pTestDF = pipelineModel.transform(testDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1")
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
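The lr estimator above is defined but never fitted in this fragment; a minimal sketch of the remaining train-and-score steps under that assumption:

# %%
lrModel = lr.fit(pTrainDF)
predictions = lrModel.transform(pTestDF)
print(evaluator.evaluate(predictions))  # F1 on the held-out split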
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *

from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    fields = [StructField("subreddit", StringType(), True),
              StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))

    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)

    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")])

    # group by subreddit and term, then count occurrences of each term per subreddit
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db = Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']) \
                   .filter('count > {}'.format(THRESHOLD)) \
                   .foreachPartition(db.saveSubredditWords)
labeledRdd = sc.parallelize(labeledData)

from pyspark.sql import SQLContext

def preProcess(doc):
    clean = doc.replace("<br /><br />", " ")
    return clean.lower()

rdd = labeledRdd.map(lambda doc: (preProcess(doc[0]), doc[1]))

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)

import itertools
# build the vocabulary from the tokenized words; collecting the raw review strings,
# as the original did, would yield a set of characters rather than words
lists = dfTrainTok.rdd.map(lambda r: r.words).collect()
dictWords = set(itertools.chain(*lists))
dictionaryWords = {}
for i, word in enumerate(dictWords):
    dictionaryWords[word] = i
dict_broad = sc.broadcast(dictionaryWords)

from pyspark.mllib.linalg import SparseVector

def vectorize(row, dico):
    vector_dict = {}
    for w in row.words:
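        # --- assumed completion of the truncated helper above; the simple
        # --- term-count scheme is a guess, not recovered from the original source
        if w in dico:
            vector_dict[dico[w]] = vector_dict.get(dico[w], 0) + 1.0
    return (row.label, SparseVector(len(dico), vector_dict))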
data = df6.select(
    'id',
    (lower(regexp_replace('comment_text', "[^a-zA-Z\\s]", "")).alias('text')),
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
data = data.select(
    'id',
    (regexp_replace('text', "[\r\n]+", "").alias('text')),
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')

# na.drop()/na.fill() return new DataFrames; the results must be assigned to take effect
data = data.na.drop()
data = data.na.fill(0)

clean = data.where(col('toxic').isNotNull()).where(
    col('severe_toxic').isNotNull()).where(col('obscene').isNotNull()).where(
    col('threat').isNotNull()).where(col('insult').isNotNull()).where(
    col('identity_hate').isNotNull())

# Token Parser
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordToken = tokenizer.transform(clean)

# Delete Stop Words
remover = StopWordsRemover(inputCol='words', outputCol='words_clean')
dataFrameNoStop = remover.transform(wordToken)

# Term Frequency
hashTermFreq = HashingTF(inputCol="words_clean", outputCol="rawFeatures")
termFreq = hashTermFreq.transform(dataFrameNoStop)

# Term Frequency-Inverse Document Frequency
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(termFreq)
tfidf = idfModel.transform(termFreq).select('features', 'toxic', 'severe_toxic', 'obscene',
def review_to_words(raw_review):
    # 1. Remove HTML markup
    review_text = BeautifulSoup(raw_review).text
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    # 4. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    # 5. Join the words back into one string separated by spaces, and return the result.
    return " ".join(meaningful_words)

stops = set(stopwords.words("english"))

lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
# skip the header row; the tuple-unpacking lambda in the original is Python 2 only
rows = lines.zipWithIndex().filter(lambda row_index: row_index[1] > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))

schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
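selectData above is prepared but not used in this fragment; a sketch of one plausible next step (the classifier choice and split are assumptions):

from pyspark.ml.classification import LogisticRegression

train, test = selectData.randomSplit([0.8, 0.2])
lrModel = LogisticRegression(maxIter=10, regParam=0.01).fit(train)
lrModel.transform(test).select("label", "prediction").show(5)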
print('\n', sms.dtypes, '\n')
sms.printSchema()

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Text to tokens
wrangled = Tokenizer(inputCol="text", outputCol="words").transform(wrangled)

# Remove stop words
wrangled = StopWordsRemover(inputCol="words", outputCol="terms").transform(wrangled)

# Apply the hashing trick
wrangled = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024).transform(wrangled)

# Convert hashed symbols to TF-IDF
sms = IDF(inputCol="hash", outputCol="features").fit(wrangled).transform(wrangled)

# View the first four records
sms.show(4, truncate=False)
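A sketch of the spam classifier that typically follows this wrangling; the split ratio, seed, regParam, and the presence of a numeric label column are assumptions:

from pyspark.ml.classification import LogisticRegression

sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)
logistic = LogisticRegression(regParam=0.2).fit(sms_train)  # assumes a 'label' column
prediction = logistic.transform(sms_test)
prediction.groupBy('label', 'prediction').count().show()    # confusion-matrix style summary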
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="TokenizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    sentenceDataFrame = sqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsDataFrame = tokenizer.transform(sentenceDataFrame)
    for words_label in wordsDataFrame.select("words", "label").take(3):
        print(words_label)
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)
    # $example off$

    sc.stop()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})

# Turning Text into Tables

# term-document text
# A selection of children's books
books.show(truncate=False)

# Removing punctuation
from pyspark.sql.functions import regexp_replace
# Regular expression (REGEX) to match commas and hyphens
REGEX = '[,\\-]'
books = books.withColumn('text', regexp_replace(books.text, REGEX, ' '))

# Text to tokens
from pyspark.ml.feature import Tokenizer
books = Tokenizer(inputCol='text', outputCol='tokens').transform(books)

# Remove stop words
from pyspark.ml.feature import StopWordsRemover
stopwords = StopWordsRemover()

# Take a look at the list of stop words
stopwords.getStopWords()

# Specify the input and output column names
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
books = stopwords.transform(books)

# Feature hashing
from pyspark.ml.feature import HashingTF
hasher = HashingTF(inputCol='words', outputCol='hash', numFeatures=32)
books = hasher.transform(books)
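The hashed features above usually feed an IDF stage next; a minimal sketch continuing with the same column names:

from pyspark.ml.feature import IDF

books = IDF(inputCol='hash', outputCol='features').fit(books).transform(books)
books.select('text', 'features').show(4, truncate=False)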
print("Create dataframe")
t0 = time()
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
print("Showing first example:")
print()
print(df.first())
tt = time() - t0
print()
print("Dataframe created in {} seconds".format(round(tt, 3)))

# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)

# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)

# In[317]:

print("Start tokenizing, computing bigrams and splitting between test and train")
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8, 0.2])
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tokenizer_sample") \
    .master("local[*]") \
    .getOrCreate()

data = [(0, "Tokenization is the process"), (1, "Refer to the Tokenizer")]
inputDF = spark.createDataFrame(data).toDF("id", "input")

tokenizer = Tokenizer(inputCol="input", outputCol="output")
outputDF = tokenizer.transform(inputDF)
outputDF.printSchema()
outputDF.show()

spark.stop()  # without parentheses the method is merely referenced, never called
for category_dir in listdir(input_dir):
    # Build the dataset of (docname, category, wordcounts) tuples
    distinct_labels[curr_cat] = category_dir
    next_docs = sc.wholeTextFiles(('/').join([input_dir, category_dir]))
    # bind the current category via a default argument: RDDs are lazy, so a plain
    # closure over curr_cat would see its final value; the original tuple-unpacking
    # lambda is also Python 2 only
    docs = docs.union(next_docs.map(lambda doc_lines, cat=float(curr_cat): (format_text(doc_lines[1]), cat)))
    curr_cat += 1

training_rows = docs.sample(False, train_fraction)
testing_rows = docs.subtract(training_rows)

# Prepare training and test documents, which are labeled.
LabeledDocument = Row("text", "label")
train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, idf, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=1000, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(train)

print('\n\n --------------- RESULT ----------------------\n\n')
print(model.transform(test).head())
print('\n\n ---------------------------------------------\n\n')
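A sketch of scoring the fitted pipeline above beyond the single head() row; the evaluator settings are assumptions:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: {}".format(evaluator.evaluate(predictions)))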
from __future__ import print_function

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.sql import SparkSession

if __name__ == '__main__':
    spark = SparkSession\
        .builder\
        .appName('Tokenizer')\
        .getOrCreate()

    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["id", "sentence"])

    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    regexTokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern="\\W")

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select('sentence', 'words')\
        .withColumn('tokens', countTokens(col('words'))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select('sentence', 'words') \
        .withColumn('tokens', countTokens(col('words'))).show(truncate=False)

    spark.stop()  # parentheses added; the bare attribute reference was a no-op