def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
def train_lg(training_data, collection):
    # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    pipeline1 = Pipeline(stages=[hashingTF, idf])

    # Fit pipeline1 to the training documents.
    model1 = pipeline1.fit(training_data)

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline2 = Pipeline(stages=[model1, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline2,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training_data)

    # model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
    #                           + collection["Id"] + '_'
    #                           + collection["name"])
    # cvModel.save(sc, model_path)
    return cvModel
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
def textPredict(request):
    """6. Text clustering and popularity prediction."""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Process the dataset and generate feature vectors.
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)

    # Train the decision tree model.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)

    # Test the model.
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Test with user-supplied data: a single news item.
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    # Evaluate the model.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    return render(request, {'resultList': resultList})
def extract_tf_features(p_df, input_col, output_col):
    """
    Extracts TF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.
    """
    hashingTF = HashingTF(inputCol=input_col, outputCol=output_col, numFeatures=3000)
    return hashingTF.transform(p_df)
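# A minimal usage sketch for extract_tf_features above. The SparkSession name `spark`,
# the sample data, and the column names are illustrative assumptions, not part of the
# original snippet.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF

spark = SparkSession.builder.master("local[*]").appName("tf-demo").getOrCreate()
tokens_df = spark.createDataFrame(
    [(0, ["spark", "hashing", "tf"]), (1, ["term", "frequency", "features"])],
    ["id", "tokens"])
# Hash each token list into a 3000-dimensional sparse term-frequency vector.
tf_df = extract_tf_features(tokens_df, "tokens", "tf_features")
tf_df.show(truncate=False)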
def term_frequency(df, column):
    """
    Compute term-frequency of the tokens contained in a column.

    Transformation: array<string> --> vector
    """
    tf = HashingTF(inputCol=column, outputCol='_' + column)
    df = tf.transform(df)
    df = replace(df, column, '_' + column)
    return df
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):
    global idfModel
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    return dataframe
def test_apply_binary_term_freqs(self):
    df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
    n = 10
    hashingTF = HashingTF()
    hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
    output = hashingTF.transform(df)
    features = output.select("features").first().features.toArray()
    expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
    for i in range(0, n):
        self.assertAlmostEqual(features[i], expected[i], 14,
                               "Error at " + str(i) + ": expected " + str(expected[i]) +
                               ", got " + str(features[i]))
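# A small standalone sketch of the binary term-frequency behaviour exercised by the test
# above; the SparkSession setup and sample data are assumptions made for illustration.
# With binary=True each hashed term contributes at most 1.0, regardless of how often it
# occurs in the document.
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF

spark = SparkSession.builder.master("local[*]").appName("binary-tf-demo").getOrCreate()
df = spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
binary_tf = HashingTF(inputCol="words", outputCol="features", numFeatures=10, binary=True)
# Expect three non-zero entries, each exactly 1.0, mirroring the assertions above.
binary_tf.transform(df).select("features").show(truncate=False)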
def predictLabel(label, title, model):
    """Predict the label of a news item."""
    sentenceData = sqlContext.createDataFrame([
        (label, title),
    ], ['label', "title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    # Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r: Row(appid=r[0], price=r[1], sentence=r[2])))
    # Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    # Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    # Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df, no_of_features, ip_col):
    # from pyspark.sql.functions import udf
    # from pyspark.sql.types import *
    output_raw_col = ip_col + "raw_features"
    output_col = ip_col + "features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
def _build_stages(self):
    self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
    self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
    self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
    self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
    self.lr = LogisticRegression(maxIter=10, regParam=0.01)
    return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]
def append_tf_idf(self, df):
    """
    Calculate term frequency and inverse document frequency
    based on at least 1 visit hourly in this case. Compares how often the tokens appeared
    at least once per hour compared to other tokens.
    Not used for the main purpose of the project.

    Args:
        :param df: Dataframe parameter.

    Returns:
        :return: Dataframe with term frequency and inverse document frequency added in the
                 columns 'rawFeatures' and 'features' respectively.
    """
    # Create TF column.
    hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
    tf = hashingTF.transform(df)
    tf.persist(StorageLevel.MEMORY_AND_DISK)
    # Create IDF column.
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(tf)
    tfidf = idfModel.transform(tf)
    return tfidf
def create_pipeline(model_type, num_features=10000):
    """
    Defines pipeline from BOW to prediction.
    """
    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                          numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()
    return Pipeline(stages=[remover, hashingTF, tfidf, model])
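# A hedged usage sketch for create_pipeline above: the DataFrame `train_df`, with a
# tokenized 'bow' column and a numeric 'label' column, is an assumption made for
# illustration and is not part of the original snippet.
pipeline = create_pipeline('log_reg', num_features=5000)
model = pipeline.fit(train_df)       # fits StopWordsRemover -> HashingTF -> IDF -> LogisticRegression
scored = model.transform(train_df)   # adds 'features', 'rawPrediction', 'probability', 'prediction'
scored.select('label', 'prediction').show(5)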
def kmeansresults(): df1 = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load \ ("canadatweets.csv") df2 = sqlContext.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load("products.csv") df3 = sqlContext.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load("products.csv") df4 = sqlContext.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load("claritin.csv") df = df1.unionAll(df2) df = df.unionAll(df3) df = df.unionAll(df4) df.show() # df2.show() tokenizer = Tokenizer(inputCol="text", outputCol="tokens") remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens") hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2**20) idf = IDF(inputCol="rawFeatures", outputCol="features") kmeans = KMeans(k=8, seed=1, featuresCol='rawFeatures', maxIter=10, initMode='random') pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, kmeans]) pipeline.save("KMeansPipeline") model = pipeline.fit(df) results = model.transform(df) results.cache() results.groupBy("prediction").count().show( ) # Note "display" is for Databricks; use show() for OSS Apache Spark # results.filter(results.prediction == 1).show(200,False) results.show() results.toPandas().to_csv( 'kmeansresultsCanadaAndProductsAndDisastersAndClaritin.csv') model.stages[-1].save("KMeansModel")
def test_get_params_to_log(spark_session): # pylint: disable=unused-argument lor = LogisticRegression(maxIter=3, standardization=False) lor_params = get_params_to_log(lor) assert ( lor_params["maxIter"] == 3 and not lor_params["standardization"] and lor_params["family"] == lor.getOrDefault(lor.family) ) ova = OneVsRest(classifier=lor, labelCol="abcd") ova_params = get_params_to_log(ova) assert ( ova_params["classifier"] == "LogisticRegression" and ova_params["labelCol"] == "abcd" and ova_params["LogisticRegression.maxIter"] == 3 and ova_params["LogisticRegression.family"] == lor.getOrDefault(lor.family) ) tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") pipeline = Pipeline(stages=[tokenizer, hashingTF, ova]) inner_pipeline = Pipeline(stages=[hashingTF, ova]) nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline]) pipeline_params = get_params_to_log(pipeline) nested_pipeline_params = get_params_to_log(nested_pipeline) assert pipeline_params["stages"] == ["Tokenizer", "HashingTF", "OneVsRest"] assert nested_pipeline_params["stages"] == ["Tokenizer", "Pipeline_2"] assert nested_pipeline_params["Pipeline_2.stages"] == ["HashingTF", "OneVsRest"] assert nested_pipeline_params["OneVsRest.classifier"] == "LogisticRegression" for params_to_test in [pipeline_params, nested_pipeline_params]: assert ( params_to_test["Tokenizer.inputCol"] == "text" and params_to_test["Tokenizer.outputCol"] == "words" ) assert params_to_test["HashingTF.outputCol"] == "features" assert params_to_test["OneVsRest.classifier"] == "LogisticRegression" assert params_to_test["LogisticRegression.maxIter"] == 3
def compute_clusters(addons_df, num_clusters, random_seed):
    """ Performs user clustering by using add-on ids as features.
    """
    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(inputCol="hashed_features", outputCol="features", minDocFreq=1)
    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    kmeans_kwargs = {"seed": random_seed} if random_seed else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return (
        model
        .transform(addons_df)
        .select(["client_id", "prediction"])
    )
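# Hedged usage sketch for compute_clusters above; `addons_df`, with 'client_id' and an
# array<string> 'addon_ids' column, is assumed purely for illustration.
clustered = compute_clusters(addons_df, num_clusters=8, random_seed=42)
clustered.groupBy("prediction").count().show()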
def RF_Model(train_dataframe, test_dataframe):
    '''
    Takes train_dataframe and test_dataframe as arguments, builds a pipeline of
    RegexTokenizer, NGram (n=3), HashingTF, IDF and a Random Forest classifier,
    and predicts the label for test_dataframe based on its features.
    The RegexTokenizer pattern is set to "\\W|\b(00|CC)\b" because it removes all
    non-word characters (extra spaces and punctuation such as '??'); '00' and 'CC'
    are also removed because they are the most frequent tokens and dropping them
    significantly improves accuracy.
    Args:
        dataframe:
            -The train_dataframe should consist of the columns 'label' and 'text'.
            -The test_dataframe should consist of the column 'text'.
    Returns:
        DataFrame['prediction': double, given_order: bigint, label: string]
            iff the data read initially is a small dataset, else
        DataFrame['prediction': double, given_order: bigint]
            if the data read initially is a big dataset
    '''
    train_dataframe = train_dataframe.repartition(96) \
        .withColumn('label', train_dataframe['label'].cast(IntegerType()))
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W|\b(00|CC)\b")
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    hashingTF = HashingTF(inputCol="ngrams", outputCol="TF")
    idf = IDF(inputCol="TF", outputCol="features")
    rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=30)
    pipeline = Pipeline(stages=[regexTokenizer, ngram, hashingTF, idf, rf])
    model = pipeline.fit(train_dataframe)
    predictions_df = model.transform(test_dataframe)
    return predictions_df \
        .drop('rawfeatures', 'n_grams', 'TF', 'text', 'words', 'features')
class BaselinePipelineEngine(PipelineEngine): @keyword_only def __init__(self, cv): super(BaselinePipelineEngine, self).__init__(cv) self.hashing_tf_map = [pow(2, 20)] self.lr_map = [0.1, 0.01] self.stages = self._build_stages() self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]) self.param_grid = self._build_param_grid() def _build_stages(self): self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed") self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words") self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features") self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features") self.lr = LogisticRegression(maxIter=10, regParam=0.01) return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr] def _build_param_grid(self): param_grid_builder = ParamGridBuilder() param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map) param_grid_builder.addGrid(self.lr.regParam, self.lr_map) return param_grid_builder.build()
def __init__(self, sc, sql_context, dataset_path): """Init the recommendation engine given a Spark context and a dataset path """ logger.info("Starting up the Sentiment Analyser Engine:") self.sc = sc self.sql_context = sql_context # Load sentiment data for later use logger.info("Loading Sentiment data...") sentiment_file_path = os.path.join(dataset_path, 'arabic_tweets_labeled.csv') sentiment_RDD = self.sql_context.read.format( 'com.databricks.spark.csv').options( header=True, inferSchema='true').load(sentiment_file_path) sentiment_RDD = sentiment_RDD.dropna() tokenizer = Tokenizer(inputCol="tweet", outputCol="words") hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf') idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) # minDocFreq: remove sparse terms label_stringIdx = StringIndexer(inputCol="target", outputCol="label") pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx]) sentiment_RDD.show() pipelineFit = pipeline.fit(sentiment_RDD) data = pipelineFit.transform(sentiment_RDD) self.pipelineFit = pipelineFit self.data = data (train_set, test_set) = data.randomSplit([0.8, 0.2], seed=2000) self.train_set = train_set self.test_set = test_set # Train the model self.seed = 1245 self.iterations = 100 self.__train_model()
def test_get_instance_param_map(spark_session): # pylint: disable=unused-argument lor = LogisticRegression(maxIter=3, standardization=False) lor_params = _get_instance_param_map(lor) assert (lor_params["maxIter"] == 3 and not lor_params["standardization"] and lor_params["family"] == lor.getOrDefault(lor.family)) ova = OneVsRest(classifier=lor, labelCol="abcd") ova_params = _get_instance_param_map(ova) assert (ova_params["classifier"] == lor.uid and ova_params["labelCol"] == "abcd" and ova_params[f"{lor.uid}.maxIter"] == 3 and ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family)) tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") pipeline = Pipeline(stages=[tokenizer, hashingTF, ova]) inner_pipeline = Pipeline(stages=[hashingTF, ova]) nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline]) pipeline_params = _get_instance_param_map(pipeline) nested_pipeline_params = _get_instance_param_map(nested_pipeline) assert pipeline_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid] assert nested_pipeline_params["stages"] == [ tokenizer.uid, { inner_pipeline.uid: [hashingTF.uid, ova.uid] }, ] for params_to_test in [pipeline_params, nested_pipeline_params]: assert (params_to_test[f"{tokenizer.uid}.inputCol"] == "text" and params_to_test[f"{tokenizer.uid}.outputCol"] == "words") assert params_to_test[f"{hashingTF.uid}.outputCol"] == "features" assert params_to_test[f"{ova.uid}.classifier"] == lor.uid assert params_to_test[f"{lor.uid}.maxIter"] == 3
def ldaresults(): df1 = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load \ ("canadatweets.csv") df2 = sqlContext.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load("products.csv") df3 = sqlContext.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load("products.csv") df4 = sqlContext.read.format("csv").option("header", "true").option( "mode", "DROPMALFORMED").load("claritin.csv") df = df1.unionAll(df2) df = df.unionAll(df3) df = df.unionAll(df4) df.show() # df2.show() tokenizer = Tokenizer(inputCol="text", outputCol="tokens") remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens") hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2**18) idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) lda = LDA(k=8, seed=1, optimizer="em", featuresCol='features') pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lda]) model = pipeline.fit(df) topics = model.stages[-1].describeTopics() # topics.show(truncate=False) transformed = model.transform(df) # transformed.sort('topicDistribution').show(20000,truncate=False) # transformed.toPandas().to_csv('ldaresultsCanadaAndProductsAndDisastersAndClaritin.csv') transformed.rdd.map(lambda row: (row['text'] ,row['features'] ,row['topicDistribution'],int(np.argmax(np.asarray([float(x) for x in row['topicDistribution']])))))\ .toDF()\ .toPandas().to_csv('ldaresultsCanadaAndProductsAndDisastersAndClaritin.csv')
def kmeans_from_csv2(file, outfile, k=8): df = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load \ (file) df.show() # df2.show() tokenizer = Tokenizer(inputCol="text", outputCol="tokens") remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens") hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=2**20) idf = IDF(inputCol="rawFeatures", outputCol="features") pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf]) model = pipeline.fit(df) results = model.transform(df) results.cache() #results.groupBy("prediction").count().show() # Note "display" is for Databricks; use show() for OSS Apache Spark # results.filter(results.prediction == 1).show(200,False) results.show() #results.toPandas().to_csv(outfile) # Trains a k-means model. xaxis = [] yaxis = [] for k in range(2, 11): xaxis.append(k) kmeans = KMeans().setK(k).setSeed(1) model = kmeans.fit(results) # Evaluate clustering by computing Within Set Sum of Squared Errors. wssse = model.computeCost(results) yaxis.append(wssse) print("Within Sum of Squared Errors for k= " + str(k) + "is " + str(wssse)) plt.plot(xaxis, yaxis) plt.show()
def train_validate(self, df): # Split the data into training and test sets (30% held out for testing) (training, test) = df.randomSplit([0.7, 0.3]) # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered") hashingTF = HashingTF(numFeatures=10000, inputCol=remover.getOutputCol(), outputCol="features") #################### # lr = LogisticRegression(maxIter=10, regParam=0.001) # pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr]) #################### # instantiate the base classifier. lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True) # instantiate the One Vs Rest Classifier. ovr = OneVsRest(classifier=lr) pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, ovr]) ##################### # Fit the pipeline to training documents. model = pipeline.fit(training) # Make predictions on test documents and print columns of interest. prediction = model.transform(test) # obtain evaluator. evaluator = MulticlassClassificationEvaluator(metricName="accuracy") # compute the classification error on test data. accuracy = evaluator.evaluate(prediction) print("Test Error : " + str(1 - accuracy)) return model
def preprocess(spark_session, data_file):
    raw_data = spark_session.read.format('json').load(data_file)

    regexTokenizer = RegexTokenizer(inputCol='text', outputCol='words',
                                    pattern='\\w+', gaps=False, toLowercase=True)
    stopWordsRemover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
    hashingTF = HashingTF(inputCol='filtered_words', outputCol='tf_features', numFeatures=20)
    idf = IDF(inputCol='tf_features', outputCol='features')

    pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, hashingTF, idf])
    pipeline_model = pipeline.fit(raw_data)
    data = pipeline_model.transform(raw_data)
    return data
def build_model_pipeline():
    """
    TF (term frequency): number of times the word occurs in a specific document
    DF (document frequency): number of times a word occurs in a collection of documents
    TF-IDF (TF - inverse DF): measures the significance of a word in a document
    """
    # 1. tokenize words, convert word to lowercase
    tokenizer = RegexTokenizer(inputCol='review',
                               outputCol='review_tokens_uf',
                               pattern='\\s+|[(),.!?\";]',
                               toLowercase=True)
    # 2. remove stopwords
    stopwords_remover = StopWordsRemover(
        stopWords=StopWordsRemover.loadDefaultStopWords('english'),
        inputCol='review_tokens_uf',
        outputCol='review_tokens')
    # 3. TF
    # cv = CountVectorizer(
    #     inputCol='review_tokens',
    #     outputCol='tf',
    #     vocabSize=200000
    # )
    cv = HashingTF(inputCol='review_tokens', outputCol='tf')
    # 4. IDF
    idf = IDF(inputCol='tf', outputCol='features')
    # 5. NB
    nb = NaiveBayes()
    pipeline = Pipeline(stages=[tokenizer, stopwords_remover, cv, idf, nb])
    return pipeline
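# A hedged usage sketch for build_model_pipeline above: `reviews_df`, with a string
# 'review' column and a numeric 'label' column, is an assumption for illustration only.
pipeline = build_model_pipeline()
train_df, test_df = reviews_df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_df)        # tokenize -> remove stopwords -> TF -> IDF -> NaiveBayes
predictions = model.transform(test_df)
predictions.select('label', 'prediction', 'probability').show(5, truncate=False)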
def getOrCreateNBModel(sc): # Load the pipeline from disk. loaded = PipelineModel.load('./nbmodel') # Returned the model loaded from the disk if found # if loaded: # return loaded # Else create the model/PipelineModel, save and return it. (df, spark) = loadSentiment140(sc, SENTIMENT140_DATA) # tokenizer = Tokenizer(inputCol='status', outputCol='barewords') # remover = StopWordsRemover(inputCol='barewords', outputCol='filtered')# , stopWords=removeStopWords()) # print('Remover', remover.transform(df).head()) tokenizer = TweetSanitizer(inputCol='status', outputCol='filtered') hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features') # Defined model parameters nb = NaiveBayes(smoothing=1.0, modelType="multinomial") # Defined Pipeline pipeline = Pipeline(stages=[tokenizer, hashingTF, nb]) # Train the data model = pipeline.fit(df) # Save the pipeline, overwrite if already present. # This won't work with PySpark's custom transformer # model.write().overwrite().save('./nbmodel') return model
def postreview(): form = PostReviewForm() if form.validate_on_submit(): if form.review.data is not None: Text = [form.review.data] df = pd.DataFrame({'Text': Text}) df2 = sqlContext.createDataFrame(df) df2 = df2.dropna() text = "Text" target = "useful" tokenizer = Tokenizer(inputCol=text, outputCol="words") hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf') idf = IDF(inputCol='tf', outputCol="features") label_stringIdx = StringIndexer(inputCol=target, outputCol="label") pipeline = Pipeline( stages=[tokenizer, hashtf, idf, label_stringIdx]) transform_df = pipelineFit.transform(df2) predictions = lrModel_final.transform(transform_df) prediction_value = predictions.withColumn( "value", predictions["prediction"].cast(IntegerType())) output = prediction_value.select('value').take(1)[0][0] if output == 1: flash( f'Thanks for posting your review, this was really usefull..!', 'success') return redirect(url_for('postreview', _anchor='review_form')) else: flash( 'Thanks for posting, really appreciate if you can share more details..', 'danger') return redirect(url_for('postreview', _anchor='review_form')) return render_template('index.html', form=form)
def main(): processed_path = sys.argv[1] output_file = sys.argv[2] comm = spark.read.csv(processed_path, schema=comments_schema).repartition(2000).cache() tokenizer = Tokenizer(inputCol="comments", outputCol="words") # wordsDF = tokenizer.transform(comm) hashing = HashingTF(inputCol="words", outputCol="features") # count_vect = CountVectorizer(inputCol="words", outputCol="features") # cv_model = count_vect.fit(wordsDF) # df_features = cv_model.transform(wordsDF) # corpus = df_features.select(col('id'), col('features')).cache() lda = LDA(k=10, maxIter=10, optimizer='online') # lda_model = lda.fit(corpus) pipeline = Pipeline(stages=[tokenizer, hashing, lda]) model = pipeline.fit(comm) transformed = model.transform(comm).selectExpr('id', 'topicDistribution') topic_text = udf(to_text) topics_df = transformed.select( transformed['id'], topic_text( transformed['topicDistribution']).alias('topicDistribution')) # topics_df.show(truncate=False) topics_df.write.option('sep', ',').save(output_file, format='csv', mode='overwrite')
def sample_tf_idf(self, mergeRDD, nl_idfModel, ece_idfModel): dataDF = mergeRDD.map( lambda p: Row(**{ 'lable': p[0], 'edu_city_exp': p[1], 'leibie and name': p[2] })).toDF() nl_hashingTF = HashingTF(inputCol='leibie and name', outputCol='nlFeatures', numFeatures=256) featuresData = nl_hashingTF.transform(dataDF) ece_hashingTF = HashingTF(inputCol='edu_city_exp', outputCol='eceFeatures', numFeatures=64) featuresData = ece_hashingTF.transform(featuresData) rescled = nl_idfModel.transform(featuresData) rescled = ece_idfModel.transform(rescled) RDD = rescled.rdd featuresRDD = RDD.map(lambda i: (i.lable, i.nlfeatures.toArray( ).tolist() + i.ecefeatures.toArray().tolist())) return featuresRDD
def tf_idf(self, dataRDD): dataDF = dataRDD.map(lambda i: Row( **{ 'name_and_desp': desp_text_division(i.name + ',' + i.work_desp ), 'salary': i.mon_wa, 'education': [i.education], 'city': [i.work_area], 'work_lable': [i.work_lable], 'work_exp': [i.work_exp] })).map(lambda i: Row( **{ 'name_and_desp': i.name_and_desp, 'salary': i.salary, 'agg': i.education + i.city + i.work_lable + i.work_exp })).toDF() dataDF.show() nd_hashingTF = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240) f_hashingTF = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256) tfdata = nd_hashingTF.transform(dataDF) tfdata = f_hashingTF.transform(tfdata) nd_idf = IDF(inputCol='ndFeatures', outputCol='ndfeatures') f_idf = IDF(inputCol='Features_agg', outputCol='features_agg') nd_idf_model = nd_idf.fit(tfdata) f_idf_model = f_idf.fit(tfdata) nd_idf_model.save('hdfs://localhost:9000/nd_idf') f_idf_model.save('hdfs://localhost:9000/agg_idf') tf_idfdata = nd_idf_model.transform(tfdata) tf_idfdata = f_idf_model.transform(tf_idfdata) featuresRDD = tf_idfdata.select('salary', 'ndfeatures', 'features_agg').rdd featuresRDD = featuresRDD.map( lambda i: (int(i.salary), i.ndfeatures.toArray().tolist() + i. features_agg.toArray().tolist())) return featuresRDD
def tf_idf(self, mergeRDD): fields = [ StructField('lable', IntegerType(), nullable=True), StructField('edu_city_exp', ArrayType(elementType=StringType()), nullable=True), StructField('leibie_name', ArrayType(elementType=StringType()), nullable=True) ] schema = StructType(fields) rowRDD = mergeRDD.map(lambda p: Row(p[0], p[1], p[2])) info_df = self.spark.createDataFrame(schema=schema, data=rowRDD).toDF( 'lable', 'edu_city_exp', 'leibie and name') info_df.show() name_df = info_df.select('lable', 'edu_city_exp', 'leibie and name') nl_hashingTF = HashingTF(inputCol='leibie and name', outputCol='nlFeatures', numFeatures=256) featurizeData = nl_hashingTF.transform(name_df) ece_hashingTF = HashingTF(inputCol='edu_city_exp', outputCol='eceFeatures', numFeatures=64) featurizeData = ece_hashingTF.transform(featurizeData) nl_idf = IDF(inputCol='nlFeatures', outputCol='nlfeatures') ece_idf = IDF(inputCol='eceFeatures', outputCol='ecefeatures') nl_idfModel = nl_idf.fit(featurizeData) ece_idfModel = ece_idf.fit(featurizeData) rescaledData = nl_idfModel.transform(featurizeData) rescaledData = ece_idfModel.transform(rescaledData) tf_idfmerge = [] for i in rescaledData.select('lable', 'nlfeatures', 'ecefeatures').collect(): ele_lst = i.nlfeatures.toArray().tolist() + i.ecefeatures.toArray( ).tolist() tf_idfmerge.append((int(i.lable), ele_lst)) print(tf_idfmerge) featuresRDD = self.sc.parallelize(tf_idfmerge) return featuresRDD
train = train.na.drop() test = test.na.drop() for col in train.columns: if col in [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate' ]: train = train.withColumn(col, train[col].cast(T.FloatType())) #Main code out_cols = [i for i in train.columns if i not in ["id", "comment_text"]] tokenizer = Tokenizer(inputCol="comment_text", outputCol="words") wordsData = tokenizer.transform(train) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures") tf = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(tf) tfidf = idfModel.transform(tf) REG = 0.1 lr = LogisticRegression(featuresCol="features", labelCol='toxic', regParam=REG) lrModel = lr.fit(tfidf.limit(5000)) res_train = lrModel.transform(tfidf) res_train.select("id", "toxic", "probability", "prediction").show(20) res_train.show(5) extract_prob = F.udf(lambda x: float(x[1]), T.FloatType()) (res_train.withColumn("proba", extract_prob("probability")).select( "proba", "prediction").show())
    # (tail of the tweet-preprocessing helper defined above this excerpt)
    return tweet.strip()

data = sc.textFile(trainingFile)
header = data.first()
rdd = data.filter(lambda row: row != header)
r = rdd.mapPartitions(lambda x: csv.reader(x))
r2 = r.map(lambda x: (processTweetText(x[3]), int(x[1])))
parts = r2.map(lambda x: Row(sentence=x[0], label=int(x[1])))
partsDF = spark.createDataFrame(parts).orderBy(rand()).limit(maxLines)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="base_words")
hashingTF = HashingTF(numFeatures=6000, inputCol="base_words", outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.05, elasticNetParam=0.025, family="binomial")

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

(trainSet, testSet) = partsDF.randomSplit([trainPercent, testPercent], 1291)

lrModel = pipeline.fit(trainSet)
lrResult = lrModel.transform(testSet)

avg = lrResult.where('label == prediction').count() / (maxLines * testPercent)
print(avg)

# Adjust maxIter to the number of iterations needed to reach convergence
# (check if it decreases less than pow(10, -3))
# import matplotlib.pyplot as plt
# a = lrModel.stages[-1].summary.objectiveHistory
## Tokenize the messages tokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=3, gaps=False, pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) ## Create H2OAutoML model automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False, seed=1, maxRuntimeSecs=300, # 5 minutes predictionCol="label") ## Remove all helper columns colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=3, gaps=False, pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover( inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) ## Create H2ODeepLearning model dl = H2ODeepLearning(epochs=10, l1=0.001, l2=0.0, hidden=[200, 200], featuresCols=[idf.getOutputCol()], predictionCol="label") ## Remove all helper columns colPruner = ColumnPruner(columns=[
f = parts.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2], label= int(float(p[3])),training=1)) linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt") partst = linest.map(lambda l: l.split(",")) ft = partst.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2],label= int(float(p[3])),training=0)) alldata = f.union(ft) schemaApp = sqlContext.createDataFrame(alldata) schemaApp.registerTempTable("data") tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms") permsData = tokenizer.transform(schemaApp) hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures") featurizedData = hashingTF.transform(permsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) wordsvectors = rescaledData["label","features"].map(lambda row: LabeledPoint(row[0], row[1])) model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100) labelsAndPreds = wordsvectors.map(lambda p: (p.label, model.predict(p.features))) trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(wordsvectors.count())
from pyspark.sql import SQLContext from pyspark.ml.feature import RegexTokenizer, HashingTF from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes from pyspark.mllib.tree import RandomForest ## Load Dataset df_pandas = pd.read_csv('sample.csv') ## Convert to Spark Dataframe sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(df_pandas) ## Tokenizer and Hashing tokenizer = RegexTokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features") df_feat = hashingTF.transform(tokenizer.transform(df)) ## Create LabeledPoint and Features for Prediction (predict the 1s observations) lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features)) predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features) ## Compare predictions from Different Models ## Logistic Regression lrm = LogisticRegressionWithSGD.train(lp, iterations=10) logit_predict = lrm.predict(predict_feat) logit_predict.sum() #9112
(20,"apple iphone 6 16gb t mobile"), (20,"Apple iPhone Apple iPhone 6 16GB 412 2 cell 2895"), (20,"iPhone 6 T Mobile 16 GB"), (20,"Apple 6 16gb T Mobile") ], ["label","text"]) # Learn a mapping from words to Vectors. #word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="textVec") #model = word2Vec.fit(documentDF) #result = model.transform(documentDF) #print result.take(2) tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText") tokenizedTextData = tokenizer.transform(documentDF) hashingTF = HashingTF(inputCol="tokenizedText", outputCol="rawFeatures") featurizedData = hashingTF.transform(tokenizedTextData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) result1 = idfModel.transform(featurizedData) for features_label in result.select("label","pcaFeatures").take(10): print(features_label) wordsvectors = result["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))
# 2. English word segmentation with Tokenizer
tokenizer = Tokenizer(inputCol="MANUFACTURER_NAME_EN_STANDARD", outputCol="MANUFACTURER_NAME_EN_WORDS")
df_standard = tokenizer.transform(df_standard)

# 3. Chinese word segmentation with jieba
df_standard = df_standard.withColumn(
    "MANUFACTURER_NAME_WORDS",
    manifacture_name_pseg_cut(df_standard.MANUFACTURER_NAME_STANDARD))
df_standard.select("MANUFACTURER_NAME_STANDARD", "MANUFACTURER_NAME_WORDS",
                   "MANUFACTURER_NAME_EN_STANDARD", "MANUFACTURER_NAME_EN_WORDS").show(truncate=False)

# 4. Build the machine learning features
hashingTF_en = HashingTF(inputCol="MANUFACTURER_NAME_EN_WORDS",
                         outputCol="raw_features_mnf_en", numFeatures=1000)
man_en_idf = IDF(inputCol="raw_features_mnf_en", outputCol="features_mnf_en")
hashingTF_cn = HashingTF(inputCol="MANUFACTURER_NAME_WORDS",
                         outputCol="raw_features_mnf_cn", numFeatures=1000)
man_cn_idf = IDF(inputCol="raw_features_mnf_cn", outputCol="features_mnf_cn")

pipeline = Pipeline(stages=[hashingTF_en, man_en_idf, hashingTF_cn, man_cn_idf])
idf_model = pipeline.fit(df_standard)
idf_model.write().overwrite().save(
    "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/idf_model")
def main(): # read data yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True) data = yahoo.select(['sector', 'description']).dropna() # tokenize texts based on regular expression tokenize = RegexTokenizer(inputCol='description', outputCol='words_all', pattern='\\W') # remove stop words stopwords = '\n'.join((DATADIR/'stopwords'/f).read_text().strip() for f in ('mysql.txt', 'nltk.txt')).splitlines() remove_stopwords = StopWordsRemover(inputCol='words_all', outputCol='words_clean').setStopWords(stopwords) # get words frequency using simple count (bag of words) add_wordcount = CountVectorizer(inputCol='words_clean', outputCol='words_count', vocabSize=10000, minDF=5) # get tf-idf words frequencies add_wordtf = HashingTF(inputCol='words_clean', outputCol='words_tf', numFeatures=10000) add_wordidf = IDF(inputCol='words_tf', outputCol='words_tfidf', minDocFreq=5) # prepare output values index_target = StringIndexer(inputCol='sector', outputCol='label') # data preparation pipeline pipeline_wordcount = Pipeline(stages=[ tokenize, remove_stopwords, add_wordcount, add_wordtf, add_wordidf, index_target, ]) # apply data preparation pipeline model_wordcount = pipeline_wordcount.fit(data) prepared = model_wordcount.transform(data) # split to training and testing training, testing = prepared.randomSplit([0.8, 0.2], seed=100500) # fit logistic regression models logistic_wordcount = LogisticRegression(regParam=0.3, elasticNetParam=0, featuresCol='words_count', labelCol='label', predictionCol='prediction', probabilityCol='probability') logistic_tfidf = LogisticRegression(regParam=0.3, elasticNetParam=0, featuresCol='words_tfidf', labelCol='label', predictionCol='prediction', probabilityCol='probability') evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', metricName='accuracy') for model, name in ( (logistic_wordcount, 'Word count + Logistic regression'), (logistic_tfidf, 'TF-IDF + Logistic regression')): predicted = model.fit(training).transform(testing) print(f'{name} model accuracy = {evaluator.evaluate(predicted)}') # fit hyperparameters grid = (ParamGridBuilder() .addGrid(logistic_wordcount.regParam, [0.1, 0.2, 0.3, 0.4]) .addGrid(logistic_wordcount.elasticNetParam, [0.0, 0.1, 0.2, 0.3]) .build() ) evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', metricName='accuracy') cv = CrossValidator( estimator=logistic_wordcount, estimatorParamMaps=grid, numFolds=5, evaluator=evaluator, seed=100500, ) if not FINAL_MODEL.exists(): model_cv = cv.fit(prepared) model_cv.save(str(FINAL_MODEL)) else: model_cv = CrossValidatorModel.load(str(FINAL_MODEL)) breakpoint()
def main(sc, sqlContext): #start = timer() #print '---Pegando usuario, posts, tokens e categorias do MongoDB---' #start_i = timer() user = findUserById(iduser) posts = findPosts(user) tokens, category, categoryAndSubcategory = getTokensAndCategories() postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3])) .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3])) .cache()) #print '####levou %d segundos' % (timer() - start_i) #print '---Pegando produtos do MongoDB---' #start_i = timer() #print '####levou %d segundos' % (timer() - start_i) #print '---Criando corpusRDD---' #start_i = timer() stpwrds = stopwords.words('portuguese') corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3])) .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0)) .cache()) #print '####levou %d segundos' % (timer() - start_i) #print '---Calculando TF-IDF---' #start_i = timer() wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2])) wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")) numTokens = len(tokens) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens) idf = IDF(inputCol="rawFeatures", outputCol="features") featurizedData = hashingTF.transform(wordsDataDF) idfModel = idf.fit(featurizedData) tfIDF = idfModel.transform(featurizedData).cache() postTFIDF = (tfIDF .filter(tfIDF.type==u'Post') #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4]))) .cache()) #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1) #print '####levou %d segundos' % (timer() - start_i) #print '---Carregando modelo---' #start_i = timer() NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria') SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm") #print '####levou %d segundos' % (timer() - start_i) #print '---Usando o modelo---' #start_i = timer() predictions = (postTFIDF .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features))) .filter(lambda p: p[2]==1) .map(lambda p: (p[0], p[1])) .groupByKey() .mapValues(list) .collect()) #print '####levou %d segundos' % (timer() - start_i) #print '---Calculando similaridades---' #start_i = timer() suggestions = [] for prediction in predictions: category_to_use = category[int(prediction[0])] #print ' Calculando similaridades para a categoria: {}'.format(category_to_use) tf = tfIDF.filter(tfIDF.type==category_to_use).cache() for post in prediction[1]: postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0] sim = (tf .map(lambda x: (post, x.label, cossine(x.features, postVector))) .filter(lambda x: x[2]>=threshold) .collect()) if len(sim) > 0: suggestions.append(sim) #print '####levou %d segundos' % (timer() - start_i) if len(suggestions) > 0: #print '---Inserindo recomendacoes no MongoDB---' #start_i = timer() insertSuggestions(suggestions, iduser, posts)
df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true').load('clean_tweet.csv')
##
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed=2000)
##
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
##
# Try to fit the model using Pyspark's HashingTF
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)  # minDocFreq removes sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
# train_df.show(5)
##
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
##
sm = SparkModel(sc, conn, rdd_path='rdd.pkl') bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow)) \ .sample(withReplacement=False, fraction=.5, seed=1) df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw']) train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1) results = [] num_features = 5000 min_doc_freq = 20 layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]] for l in layers: remover = StopWordsRemover(inputCol="raw", outputCol="words") hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=num_features) tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=min_doc_freq) indexer = StringIndexer(inputCol="string_label", outputCol="label") mlpc = MultilayerPerceptronClassifier(maxIter=100, layers=l, blockSize=128) pipeline = Pipeline(stages=[remover, hashingTF, tfidf, indexer, mlpc]) model = pipeline.fit(train_rdd) df_output = model.transform(train_rdd) test_output = model.transform(test_rdd).select("label", "prediction") score = test_output.rdd.map(lambda row: row.label == row.prediction).mean()
## reading csv file
data = pd.read_csv("sms_spam.csv")
# print(data.head(5))

## creating rdd file
sc = SparkContext("local", "app")
sqc = SQLContext(sc)
df = sqc.createDataFrame(data, ['type', 'text'])

# NEW VARIABLE GENERATION
dataCleaned = df.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text'])))
dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1]))
dfClean = sqc.createDataFrame(dataClean, ['label', 'words'])
dfClean.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000)
tf = hashingTF.transform(dfClean)
idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf)
dfFinal = idf.transform(tf)

# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                               maxCategories=4).fit(dfFinal)
# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])
# Train the model.
## Tokenize the messages tokenizer = RegexTokenizer(inputCol="text", outputCol="words", minTokenLength=3, gaps=False, pattern="[a-zA-Z]+") ## Remove ignored words stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) if algo == "gbm": ## Create GBM model algoStage = H2OGBM(ratio=0.8, seed=1, featuresCols=[idf.getOutputCol()], predictionCol="label") elif algo == "dl":
spark = SparkSession.builder.master("local").appName("Word Count").config( "spark.some.config.option", "some-value").getOrCreate() df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv', header=True) df = df.select(df['ItemID'], df['SentimentText'], df['label']) training = df.selectExpr("cast(itemID as int) id", "SentimentText", "cast(label as int) label") tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words") remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered") ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams") hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures") idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures") normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features", p=1.0) #lr = LogisticRegression(maxIter=10, regParam=0.001) nb = NaiveBayes(smoothing=1.0) pipeline = Pipeline( stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb]) model = pipeline.fit(training) """ paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build() crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
def main(sc, sqlContext): start = timer() stpwrds = stopwords.words('english') tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N')) print '---Pegando produtos---' start_i = timer() productRDD = sc.parallelize(findProductsByCategory([])) print '####levou %d segundos' % (timer()-start_i) print '---Criando corpus---' start_i = timer() corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3])) .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] )) .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3])) .cache()) print '####levou %d segundos' % (timer()-start_i) print '---Pegando e persistindo dados de categoria e tokens---' start_i = timer() tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect() numTokens = len(tokens) category = productRDD.map(lambda x: x[2]).distinct().collect() categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect() insertTokensAndCategories(tokens, category, categoryAndSubcategory) print '####levou %d segundos' % (timer()-start_i) print '---Calculando TF-IDF dos produtos---' start_i = timer() wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3])) #persistir isso para que ele nao tenha que fazer de novo na predicaoo wordsDataDF = sqlContext.createDataFrame(wordsData) #persistindo para a predicao wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2])) #persistir isso para que ele nao tenha que fazer de novo na predicaoo wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction) if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"): shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet") wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet") hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens) idf = IDF(inputCol="rawFeatures", outputCol="features") featurizedData = hashingTF.transform(wordsDataDF) idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features)) VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features)) VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L) print '####levou %d segundos' % (timer()-start_i) print '--Criando modelo Naive Bayes---' start_i = timer() model = NaiveBayes.train(VSMTrain) if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"): shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria") model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria') print '####levou %d segundos' % (timer()-start_i) print '---Testando modelo Naive Bayes---' start_i = timer() prediction = VSMTest.map(lambda p : (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)])) acuraccy = float(prediction.filter(lambda (x, v): x[0]==v[0]).count())/float(prediction.count()) print 'acuracidade de %f' % acuraccy print '####levou %d segundos' % (timer()-start_i) print '---Pegando os posts---' start_i = timer() posts = list() wb = load_workbook(filename = 
'/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx') sheet = wb['Menes'] for row in sheet.iter_rows(row_offset=1): post = list() for cell in row: if cell.value is None: break post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value)) if len(post) > 0: posts.append(tuple(post)) print '####levou %d segundos' % (timer()-start_i) print '---Criando corpus---' start_i = timer() postsRDD = sc.parallelize(posts) postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower()))) .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds])) .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'])) .cache()) print '####levou %d segundos' % (timer()-start_i) print '---Calculando TF-IDF dos Posts---' start_i = timer() wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1])) wordsDataDF = sqlContext.createDataFrame(wordsData) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens) idf = IDF(inputCol="rawFeatures", outputCol="features") featurizedData = hashingTF.transform(wordsDataDF) idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features)) VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L) print '####levou %d segundos' % (timer()-start_i) print '--Criando modelo SVM---' start_i = timer() model = SVMWithSGD.train(VSMTrain, iterations=100) if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"): shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm") model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm") print '---Testando modelo SVM---' start_i = timer() prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features))) acuraccy = prediction.filter(lambda (v, p): v != p).count() / float(prediction.count()) print 'acuracidade de %f' % acuraccy print '####levou %d segundos' % (timer()-start_i) print 'O processo todo levou %d segundos' % (timer()-start)
def test_save_load_pipeline_estimator(self):
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and ova.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(5)
    lr2 = LogisticRegression().setMaxIter(10)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100]) \
        .addGrid(ova.classifier, [lr1, lr2]) \
        .build()

    tvs = TrainValidationSplit(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator())
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel = tvs.fit(training)

    # Test save/load of TrainValidationSplitModel.
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
    for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                          tvsModel.bestModel.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)

    # Test nested pipeline.
    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    tvs2 = TrainValidationSplit(estimator=nested_pipeline,
                                estimatorParamMaps=paramGrid,
                                evaluator=MulticlassClassificationEvaluator())
    tvs2Path = temp_path + "/tvs2"
    tvs2.save(tvs2Path)
    loadedTvs2 = TrainValidationSplit.load(tvs2Path)
    self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel2 = tvs2.fit(training)

    # Test save/load of the nested TrainValidationSplitModel.
    tvsModelPath2 = temp_path + "/tvsModel2"
    tvsModel2.save(tvsModelPath2)
    loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
    self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
    loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
    original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
    self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
    self.assertEqual(len(loaded_nested_pipeline_model.stages),
                     len(original_nested_pipeline_model.stages))
    for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                          original_nested_pipeline_model.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)
# COMMAND ----------

tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)


# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
  .setInputCol("DescOut")\
  .setOutputCol("TFOut")\
  .setNumFeatures(10000)
idf = IDF()\
  .setInputCol("TFOut")\
  .setOutputCol("IDFOut")\
  .setMinDocFreq(2)


# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)


# COMMAND ----------

from pyspark.ml.feature import Word2Vec
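# COMMAND ----------

# A minimal Word2Vec sketch (illustrative only, not from the original notebook): the
# parameter values below are assumptions; it simply embeds the tokenized "DescOut"
# column used above into small dense vectors.
word2Vec = Word2Vec(vectorSize=5, minCount=0, inputCol="DescOut", outputCol="word2vecOut")
w2vModel = word2Vec.fit(tfIdfIn)
w2vModel.transform(tfIdfIn).select("DescOut", "word2vecOut").show(10, False)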
concat_string_arrays = concat(StringType())
df = df.withColumn(
    'joined_tokens',
    concat_string_arrays(col('filtered_title_tokens'),
                         col('filtered_sterm_tokens'),
                         col('filtered_attr_tokens')))

joined_ngram = NGram(n=2, inputCol="joined_tokens", outputCol="joined_ngrams")
df = joined_ngram.transform(df)

'''
stemmingUdf = udf(stemming, ArrayType(StringType()))
df = df.withColumn('stemmed_tokens', stemmingUdf('joined_tokens'))
'''

joined_hashingTF = HashingTF(inputCol="joined_ngrams", outputCol="joined_rawFeatures",
                             numFeatures=30000)
df = joined_hashingTF.transform(df)
joined_idf = IDF(inputCol="joined_rawFeatures", outputCol="features")
joined_idfModel = joined_idf.fit(df)
df = joined_idfModel.transform(df)

'''
assembler = VectorAssembler(
    inputCols=['title_features', 'sterm_features', 'attr_features'],
    outputCol='features')
df = assembler.transform(df)
'''
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

df = spark.read.load('/home/manh/Documents/data/result_pre.parquet')
df = df.select('id', 'stemmed')

rdd = df.select('stemmed').rdd
# document frequency: for each distinct word, count the number of documents it appears in
pre_idf = rdd.map(lambda x: set(x[0])) \
             .flatMap(lambda x: x) \
             .map(lambda x: (x, 1)) \
             .reduceByKey(lambda x, y: x + y)
pre_idf_collect = pre_idf.collect()

rdd_words = pre_idf.map(lambda x: Row(word=[x[0]]))
df_words = spark.createDataFrame(rdd_words)

hashingTF = HashingTF(inputCol="word", outputCol="rawFeatures", numFeatures=100000)
featurizedData = hashingTF.transform(df_words)
# map each word to the hashing-TF bucket index it falls into
featurizedData.rdd.map(lambda x: (x.word[0], x['rawFeatures'].indices[0])) \
                  .map(lambda x: '%s %s' % x) \
                  .collect()
def main():
    spark = SQLContext(SparkContext.getOrCreate())

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()
    breakpoint()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description', outputCol='words_all', pattern=r'\W')
    breakpoint()

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(inputCol='words_all',
                                        outputCol='words_clean').setStopWords(stopwords)
    breakpoint()

    # get word frequencies using a simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean', outputCol='words_count',
                                    vocabSize=1000, minDF=2)
    breakpoint()

    # get tf-idf word frequencies
    add_wordtf = HashingTF(inputCol='words_clean', outputCol='words_tf', numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf', outputCol='words_tfidf', minDocFreq=2)
    breakpoint()

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')
    breakpoint()

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])
    # apply data preparation pipeline
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)
    breakpoint()

    # split into training and testing sets
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)
    breakpoint()

    # fit logistic regression models
    logistic_wordcount = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                            featuresCol='words_count', labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')
    logistic_tfidf = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                        featuresCol='words_tfidf', labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')
    breakpoint()

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  metricName='accuracy')
    for model, name in ((logistic_wordcount, 'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
    breakpoint()
conn = S3Connection()
sc = set_spark_context()
sqc = SQLContext(sc)
sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

logging.basicConfig(format='%(asctime)s %(message)s')
grid_search = logging.getLogger('main')
grid_search.setLevel(logging.DEBUG)
handler = logging.FileHandler('../logs/grid_search.txt')
grid_search.addHandler(handler)

bow_rdd = sm.RDD.map(lambda (key, (bow, meta)): (key, bow))
bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow))

remover = StopWordsRemover(inputCol="raw", outputCol="words")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                      numFeatures=10000)
tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=20)
indexer = StringIndexer(inputCol="string_label", outputCol="label")

for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:
    if type(model) == MultilayerPerceptronClassifier:
        layers = [10000, 100, 2]
        model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)
    pipeline = Pipeline(stages=[remover, hashingTF, tfidf,  # scaler,
                                indexer, model])
    scores = cross_val_score(pipeline, bow_rdd)
    grid_search.debug('Model: %s\nscores: %s\nAverage: %s'
                      % (type(model), scores, scores.mean()))
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer


# In[17]:

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([(0, "a b c d e spark", 1.0),
                                  (1, "b d", 0.0),
                                  (2, "spark f g h", 1.0),
                                  (3, "hadoop mapreduce", 0.0)],
                                 ["id", "text", "label"])


# In[18]:

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])


# In[19]:

# Fit the pipeline to training documents.
model = pipeline.fit(training)


# In[20]:

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([(4, "spark i j k"),
                              (5, "l m n"),
                              (6, "spark hadoop spark"),
                              (7, "apache hadoop")],
                             ["id", "text"])
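# In[21]:

# A minimal sketch of the usual next step (assumed, not shown above): run the fitted
# pipeline on the unlabeled test documents and inspect the predictions.
prediction = model.transform(test)
for row in prediction.select("id", "text", "probability", "prediction").collect():
    print(row)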
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words("english")))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="target_indexed",
    metricName="precision")
score = data.map(lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
split_neg_data2 = score.zip(comment)
tranform_data = split_neg_data2.map(lambda p: (p[0], p[1]))  #.toDF()#.withColumnRenamed('_1','label')
#tranform_data.show()

#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),(0, "I wish Java could use case classes"),(1, "Logistic regression models are neat")]).toDF("label", "sentence")
sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# compute TF-IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
start_time = time.time()
modelClassifier = nb.fit(trainingData)
end_time = time.time()
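# A minimal evaluation sketch (assumed follow-up, not part of the original snippet):
# score the held-out split with the fitted Naive Bayes model and report accuracy.
# Column names follow the code above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = modelClassifier.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexed", predictionCol="prediction",
                                              metricName="accuracy")
print("training took %.2f seconds" % (end_time - start_time))
print("test accuracy = %g" % evaluator.evaluate(predictions))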
def review_to_words(raw_review):
    # 1. Remove HTML markup
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 5. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)


stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row, index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))

schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")
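# A minimal sketch of one possible next step (assumed, not part of the original snippet):
# split the TF-IDF features and fit a logistic regression classifier. Parameter values
# are illustrative.
from pyspark.ml.classification import LogisticRegression

train, test = selectData.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression(maxIter=10, regParam=0.01, featuresCol="features", labelCol="label")
lrModel = lr.fit(train)
lrModel.transform(test).select("label", "prediction").show(5)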