def test_simplepipe():
    df = SPARK_SESSION.sparkContext.\
        parallelize([['this is a test'], ['this is another test']]).\
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    # `|` chains ML stages into a pipeline via the project's composition helper
    # (not stock PySpark): Tokenizer -> CountVectorizer -> IDF.
    pl = feature.Tokenizer().setInputCol('sentence') | \
        feature.CountVectorizer() | \
        feature.IDF()
    pl_model = pl.fit(df)
    pl_model.transform(df).count()
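# For readers without the `|` composition helper, a rough stock-PySpark equivalent of
# the pipeline above. This is a sketch under the assumption that the helper simply
# wires each stage's output column into the next stage's input column; the column
# names 'tokens' and 'tf' are illustrative, not taken from the original code.
from pyspark.ml import Pipeline, feature
from pyspark.sql import SparkSession, types

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [('this is a test',), ('this is another test',)],
    schema=types.StructType().add('sentence', types.StringType()))
explicit_pl = Pipeline(stages=[
    feature.Tokenizer(inputCol='sentence', outputCol='tokens'),
    feature.CountVectorizer(inputCol='tokens', outputCol='tf'),
    feature.IDF(inputCol='tf', outputCol='features'),
])
explicit_pl.fit(sample_df).transform(sample_df).show(truncate=False)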
def n_gram(df, input_col, n=2):
    """
    Converts the input string column of a Spark DataFrame into TF-IDF features built
    from both raw tokens and n-grams.

    :param df: PySpark DataFrame to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram, >= 1.
    :return: Tuple of (transformed Spark DataFrame, fitted pipeline model).
    """
    is_dataframe(df)
    # Tokenize and drop stop words, vectorize both the raw tokens and the n-grams,
    # assemble the two count vectors, and weight the result with IDF.
    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')
    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
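# A minimal usage sketch for n_gram() with hypothetical data. It assumes an active
# SparkSession and the same `|` stage-composition helper the function relies on.
from pyspark.sql import SparkSession, types

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [('spark makes tfidf pipelines easy',),
     ('tfidf weights rare terms more heavily',)],
    schema=types.StructType().add('sentence', types.StringType()))
bigram_df, bigram_model = n_gram(sample_df, 'sentence', n=2)
bigram_df.select('features').show(truncate=False)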
def getTFIDF(closest):
    # Concatenate all aspects assigned to each cluster into one document per cluster.
    grouped_clusters = closest.groupBy("prediction")\
        .agg(F.collect_list("split_aspect").alias("text"))\
        .withColumn("text", F.concat_ws(" ", "text"))
    tokenizer = feat.Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(grouped_clusters)
    # Get term frequencies. CountVectorizer is used instead of HashingTF because it
    # does not hash the words, so feature indices can be mapped back to vocabulary terms.
    cv = feat.CountVectorizer(inputCol="words", outputCol="rawFeatures").fit(wordsData)
    featurizedData = cv.transform(wordsData)
    # Keep the vocabulary so indices can be translated back to words later.
    vocab = cv.vocabulary
    # Compute IDF and produce the final TF-IDF vectors.
    idf = feat.IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    tfidf = idfModel.transform(featurizedData)
    return tfidf, vocab
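# Follow-up sketch: one way to use the returned vocabulary to recover the top TF-IDF
# terms per cluster. The column names "prediction" and "features" match getTFIDF above;
# the top_terms helper itself is hypothetical, not part of the original code.
def top_terms(tfidf_df, vocab, k=10):
    rows = tfidf_df.select("prediction", "features").collect()
    result = {}
    for row in rows:
        vec = row["features"]  # SparseVector of TF-IDF weights
        ranked = sorted(zip(vec.indices, vec.values), key=lambda iv: -iv[1])
        result[row["prediction"]] = [vocab[int(i)] for i, _ in ranked[:k]]
    return result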
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()
    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')
    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
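# Fetching stop words over HTTP ties the test above to an external host. A sketch of an
# offline alternative (an assumption, not part of the original test) using the stop-word
# lists that ship with Spark:
from pyspark.ml.feature import StopWordsRemover

stop_words_offline = StopWordsRemover.loadDefaultStopWords('english')
remover = StopWordsRemover(stopWords=stop_words_offline)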
features = "text"
label = "first_label"
data_dir = "training_sample"

logger.info("Starting Spark Context")
spark = sparknlp.start()
conf = pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true")
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlcontext = pyspark.SQLContext(sc)
training_set = (sqlcontext.read.format("parquet")
                .option("header", True)
                .load(data_dir))

# TF
cv = sf.CountVectorizer(inputCol=features, outputCol="tf_features")
# IDF
idf = sf.IDF(inputCol="tf_features", outputCol="features")
# StringIndexer
label_string = sf.StringIndexer(inputCol=label, outputCol="label")
# Logistic regression
lr = LogisticRegression(maxIter=10, family="multinomial")

pipeline = Pipeline(stages=[cv, idf, label_string, lr])
paramGrid = (ParamGridBuilder()
             .addGrid(cv.vocabSize, [500, 1000, 1500])
             .addGrid(lr.regParam, [0.1, 0.01, 0.001])
             .build())

logger.info("Pipeline created ...")
logger.info("Starting grid search ...")
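# The excerpt stops at the log line announcing the grid search. A hedged continuation
# sketch of one plausible way to run it with CrossValidator; the evaluator choice and
# numFolds are assumptions, not taken from the original code.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")
cv_search = CrossValidator(estimator=pipeline,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           numFolds=3)
cv_model = cv_search.fit(training_set)
logger.info("Best average f1 across the grid: %s", max(cv_model.avgMetrics))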
    .setStopWords(stopWords) \
    .setCaseSensitive(False)

normalizer = Normalizer() \
    .setInputCols(["stem"]) \
    .setOutputCol("normalized")

finisher = Finisher() \
    .setInputCols(["clean_tokens"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer,
                                stemmer, stopwords, finisher])
nlp_model = nlp_pipeline.fit(df)
processed = nlp_model.transform(df).persist()
processed = processed.withColumn("ntokens", remove_url(F.col("ntokens")))

tf = spark_ft.HashingTF(numFeatures=1 << 16, inputCol='ntokens', outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='tfidf')
feature_pipeline = Pipeline(stages=[tf, idf])
feature_model = feature_pipeline.fit(processed)
features = feature_model.transform(processed).persist()

features.show(100, False)

features = features.select('sponsoring_country', 'tweetid', 'userid',
                           'tweet_text', 'is_validation', 'tfidf')
features.write.parquet('tweets_tfidf.parquet')
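# remove_url is used above but not defined in this excerpt. A plausible stand-in
# (a hypothetical sketch, not the original author's implementation) is a UDF that
# drops URL-like tokens from the token array; it would need to be defined before
# its use above.
import re
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

_URL_RE = re.compile(r'https?://\S+|www\.\S+')

@F.udf(returnType=ArrayType(StringType()))
def remove_url(tokens):
    return [t for t in (tokens or []) if not _URL_RE.match(t)]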
label = F.udf(lambda x: 1.0 if x == 'escalate' else 0.0, FloatType())
df = df.withColumn('label', label('label'))

# In[23]:

df.select('label').show()

# In[24]:

import pyspark.ml.feature as feat

TF_ = feat.HashingTF(inputCol="words without stop", outputCol="rawFeatures",
                     numFeatures=100000)
IDF_ = feat.IDF(inputCol="rawFeatures", outputCol="features")

# In[25]:

pipelineTFIDF = Pipeline(stages=[TF_, IDF_])

# In[26]:

pipelineFit = pipelineTFIDF.fit(df)
df = pipelineFit.transform(df)

# In[27]: