def test_simplepipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is a test'], ['this is another test']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    pl = feature.Tokenizer().setInputCol('sentence') | \
        feature.CountVectorizer() | \
        feature.IDF()
    pl_model = pl.fit(df)
    pl_model.transform(df).count()

def test_ml_pipe():
    df = sc. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
        toDF()
    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)

def test_stackedml_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
        toDF()
    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    ml = pl | (classification.LogisticRegression(),) | feature.VectorAssembler() | \
        classification.RandomForestClassifier()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)

def logistic_regression_text(df, input_col):
    """
    Runs a logistic regression for an input (text) DataFrame.

    :param df: Pyspark dataframe to analyze.
    :param input_col: Text column to tokenize and use as model input.
    :return: Tuple of (DataFrame with predictions, fitted pipeline model).
    """
    assert_spark_df(df)
    pl = feature.Tokenizer().setInputCol(input_col) | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()
    ml_model = ml.fit(df)
    df_model = ml_model.transform(df)
    return df_model, ml_model

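# Hedged usage sketch, not part of the original source: it assumes a SparkSession
# named `spark` is available, and the sentences, labels, and column names below are
# illustrative only. A 'label' column is expected by LogisticRegression's defaults.
def example_logistic_regression_text(spark):
    df = spark.createDataFrame(
        [('this is a test', 0.0), ('this is another test', 1.0)],
        ['sentence', 'label'])
    # input_col is the text column that gets tokenized and vectorized.
    predictions, model = logistic_regression_text(df, input_col='sentence')
    return predictions.select('sentence', 'prediction')
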
def canonicaltokens(df, inputColumn, outputColumn):
    """
    Turn the input column of strings into canonical format and add it to the
    dataframe as an output column of tokens.
    """
    # Trim surrounding whitespace and join spaced-out single characters before
    # tokenizing (e.g. initials written as separate letters).
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(
                f.rtrim(f.ltrim(f.col(inputColumn))),
                r" (\w) (\w) ", "$1$2"),
            r"(\w) (\w) (\w)$", "$1$2$3"))
    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")
    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")
    return canonicalname

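# Hedged usage sketch, not part of the original source: assumes a SparkSession named
# `spark`; the name strings and column names are made up for illustration.
def example_canonicaltokens(spark):
    df = spark.createDataFrame(
        [("  The Coca Cola Company  ",), ("Acme Ltd",)], ["name"])
    # Adds a 'name_tokens' column containing trimmed, lower-cased tokens with
    # default English stop words removed.
    return canonicaltokens(df, inputColumn="name", outputColumn="name_tokens")
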
def test_multi_model_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
        toDF()
    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    models = (classification.LogisticRegression(),
              classification.RandomForestClassifier(),
              classification.LogisticRegression().setElasticNetParam(0.2),
              classification.GBTClassifier())
    ml = pl | models | feature.VectorAssembler().setOutputCol('final_features') | \
        classification.LogisticRegression()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)

def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside of a Spark DF into an array of n-grams.

    :param df: Pyspark dataframe to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram (>= 1).
    :return: Tuple of (Spark DataFrame with n-grams calculated, fitted pipeline model).
    """
    is_dataframe(df)
    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')
    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model

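# Hedged usage sketch, not part of the original source: `spark` is an assumed
# SparkSession and the sentences are illustrative only.
def example_n_gram(spark):
    df = spark.createDataFrame(
        [('spark makes distributed text processing simple',),
         ('n-grams capture short word sequences',)],
        ['sentence'])
    # Returns the transformed DataFrame (with a 'features' TF-IDF column built from
    # unigram and bigram counts) together with the fitted pipeline model.
    df_model, tfidf_model = n_gram(df, input_col='sentence', n=2)
    return df_model.select('sentence', 'features')
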
def getTFIDF(closest):
    grouped_clusters = closest.groupBy("prediction") \
        .agg(F.collect_list("split_aspect").alias("text")) \
        .withColumn("text", F.concat_ws(" ", "text"))
    tokenizer = feat.Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(grouped_clusters)
    # get term frequencies (using CountVectorizer because, unlike HashingTF, it does
    # not hash the words, so vector indices can be mapped back to words)
    cv = feat.CountVectorizer(inputCol="words", outputCol="rawFeatures").fit(wordsData)
    featurizedData = cv.transform(wordsData)
    # save vocab object
    vocab = cv.vocabulary
    # compute idf
    idf = feat.IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    tfidf = idfModel.transform(featurizedData)
    return tfidf, vocab

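# Hedged usage sketch, not part of the original source: the input is assumed to be
# the output of an upstream clustering step with a 'prediction' column and a string
# 'split_aspect' column; `spark` and the rows below are invented for illustration.
def example_getTFIDF(spark):
    closest = spark.createDataFrame(
        [(0, 'battery life'), (0, 'battery drain'), (1, 'screen quality')],
        ['prediction', 'split_aspect'])
    tfidf, vocab = getTFIDF(closest)
    # 'features' holds one TF-IDF vector per cluster; vocab maps indices to words.
    return tfidf.select('prediction', 'features'), vocab
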
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()
    tokenizer = feature.Tokenizer().setInputCol('sentence') | \
        feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')
    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)

def create_vocab(df):
    """Create a vocabulary from a dataframe. Also removes some special tokens.

    Args:
        df: A dataframe with columns 'processed_abstract' and 'processed_full_text'

    Returns:
        vocab: A word list sorted by frequency
    """
    concat_udf = F.udf(
        lambda cols: " ".join([col for col in cols]),
        spark_types.StringType())
    df = df.withColumn(
        'all_text',
        concat_udf(F.array('processed_abstract', 'processed_full_text')))
    tokenizer = ml_feature.Tokenizer(inputCol='all_text', outputCol='tokens')
    df = tokenizer.transform(df)
    cv = ml_feature.CountVectorizer(
        inputCol='tokens', outputCol='vectors', vocabSize=200000)
    cv_model = cv.fit(df)
    # the fitted vocabulary is sorted by descending term frequency
    vocab = cv_model.vocabulary
    vocab.remove(SENT_START)
    vocab.remove(SENT_END)
    vocab.remove(SEC_START)
    vocab.remove(SEC_END)
    return vocab