def buildModel(self, save_pipe_path=None):
    df = self.getModelData()
    label_index = fea.StringIndexer(inputCol='user_type', outputCol='label')
    reTokenizer = fea.RegexTokenizer(inputCol='appnames', outputCol='appname_token', pattern=',')
    cnt_vector = fea.CountVectorizer(inputCol='appname_token', outputCol='appname_vector')
    vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol='feature')
    scaler = fea.StandardScaler(inputCol='feature', outputCol='features')

    if not save_pipe_path:
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')

        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler])
        pipe = pipeline.fit(df)
        pipe_out = pipe.transform(df)

        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        model = cv.fit(pipe_out)

        print(evaluator.evaluate(model.transform(pipe_out)))
        print('Best Param (elasticNetParam): ', model.bestModel._java_obj.getElasticNetParam())

        predict_result = model.transform(pipe_out).select('probability', 'label').toPandas()
        predict_result.to_csv('/home/chenchen/data/predict_result1.csv', index=False)
    else:
        lr = LogisticRegression(elasticNetParam=1.0)
        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler, lr])
        model = pipeline.fit(df)
        model.save(save_pipe_path)
        print('pipe saved')
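# A minimal sketch of reusing the pipeline saved above, assuming `save_pipe_path`
# points to the directory written by model.save() and that `new_df` (illustrative
# name) has the same 'appnames' and 'user_type' columns used for training.
from pyspark.ml import PipelineModel

loaded_pipe = PipelineModel.load(save_pipe_path)   # reload the fitted stages
predictions = loaded_pipe.transform(new_df)        # score unseen data
predictions.select('probability', 'prediction').show(5)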
def test_simplepipe():
    df = SPARK_SESSION.sparkContext.\
        parallelize([['this is a test'], ['this is another test']]).\
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    pl = feature.Tokenizer().setInputCol('sentence') | \
        feature.CountVectorizer() | \
        feature.IDF()
    pl_model = pl.fit(df)
    pl_model.transform(df).count()
def test_ml_pipe():
    df = sc. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
        toDF()
    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside of a Spark DF into an array of n-grams.
    :param df: Pyspark dataframe to analyze
    :param input_col: Column to analyze.
    :param n: number of elements per n-gram >= 1.
    :return: Spark DataFrame with n-grams calculated.
    """
    is_dataframe(df)

    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
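# Hypothetical usage of n_gram(), assuming an active SparkSession named `spark`
# and a text column called 'sentence' (both names are illustrative only).
sample_df = spark.createDataFrame(
    [('this is the best sentence ever',),
     ('this is however the worst sentence available',)],
    ['sentence'])

ngram_df, ngram_model = n_gram(sample_df, input_col='sentence', n=2)
ngram_df.select('sentence', 'features').show(truncate=False)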
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))

    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
def test_stackedml_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()
    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    ml = pl | (classification.LogisticRegression(),) | feature.VectorAssembler() | \
        classification.RandomForestClassifier()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
def logistic_regression_text(df, input_col):
    """
    Runs a logistic regression for input (text) DataFrame.
    :param df: Pyspark dataframe to analyze
    :param input_col: Column to predict
    :return: DataFrame with logistic regression and prediction run.
    """
    assert_spark_df(df)

    pl = feature.Tokenizer().setInputCol(input_col) | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()
    ml_model = ml.fit(df)
    df_model = ml_model.transform(df)
    return df_model, ml_model
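# Hypothetical call to logistic_regression_text(), assuming an active SparkSession
# `spark` and the default 'label' column that LogisticRegression expects.
train_df = spark.createDataFrame(
    [('this is a test', 0.0), ('this is another test', 1.0)],
    ['sentence', 'label'])

scored_df, lr_model = logistic_regression_text(train_df, input_col='sentence')
scored_df.select('sentence', 'prediction', 'probability').show(truncate=False)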
def test_multi_model_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()
    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    models = (classification.LogisticRegression(),
              classification.RandomForestClassifier(),
              classification.LogisticRegression().setElasticNetParam(0.2),
              classification.GBTClassifier())
    ml = pl | models | feature.VectorAssembler().setOutputCol('final_features') | \
        classification.LogisticRegression()
    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
def getTFIDF(closest):
    grouped_clusters = closest.groupBy("prediction")\
        .agg(F.collect_list("split_aspect").alias("text"))\
        .withColumn("text", F.concat_ws(" ", "text"))

    tokenizer = feat.Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(grouped_clusters)

    # get term freqs (using CountVectorizer because, unlike HashingTF, it does not
    # hash the words, so feature indices can be mapped back to the vocabulary)
    cv = feat.CountVectorizer(inputCol="words", outputCol="rawFeatures").fit(wordsData)
    featurizedData = cv.transform(wordsData)

    # save vocab object
    vocab = cv.vocabulary

    # compute idf
    idf = feat.IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    tfidf = idfModel.transform(featurizedData)

    return tfidf, vocab
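# A minimal sketch of mapping TF-IDF feature indices back to words via the vocabulary
# returned by getTFIDF(); `closest` is assumed to be the clustered DataFrame the
# function expects, and the feature vectors are assumed to be sparse.
tfidf, vocab = getTFIDF(closest)

for row in tfidf.select("prediction", "features").collect():
    vec = row["features"]
    # pair each non-zero index with its TF-IDF weight and keep the top terms
    top = sorted(zip(vec.indices, vec.values), key=lambda x: -x[1])[:10]
    print(row["prediction"], [vocab[int(i)] for i, _ in top])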
def create_vocab(df):
    """Create a vocabulary from a dataframe. Also removes some special tokens.

    Args:
        df: A dataframe with columns 'processed_abstract' and 'processed_full_text'
    Return:
        vocab: A word list sorted by frequency
    """
    concat_udf = F.udf(
        lambda cols: " ".join([col for col in cols]),
        spark_types.StringType())
    df = df.withColumn(
        'all_text',
        concat_udf(F.array('processed_abstract', 'processed_full_text')))

    tokenizer = ml_feature.Tokenizer(inputCol='all_text', outputCol='tokens')
    df = tokenizer.transform(df)

    cv = ml_feature.CountVectorizer(
        inputCol='tokens', outputCol='vectors', vocabSize=200000)
    cv_model = cv.fit(df)

    # the vocabulary is sorted by term frequency
    vocab = cv_model.vocabulary
    vocab.remove(SENT_START)
    vocab.remove(SENT_END)
    vocab.remove(SEC_START)
    vocab.remove(SEC_END)
    return vocab
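# Hypothetical usage of create_vocab(); SENT_START/SENT_END/SEC_START/SEC_END are
# assumed to be module-level constants marking sentence/section boundaries, and the
# input text must actually contain them, otherwise list.remove() raises ValueError.
vocab = create_vocab(papers_df)   # papers_df is an illustrative name
print(len(vocab), vocab[:10])     # most frequent tokens first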
text_1 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/49960.txt')
text_2 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/51060.txt')
text_data = text_1.union(text_2)

tokenizer = ft.RegexTokenizer(inputCol='documents', outputCol='input_arr',
                              pattern=r'\s+|[,.\"]')
df1 = tokenizer.transform(text_data)

stopwords = ft.StopWordsRemover(inputCol='input_arr', outputCol='input_stop')
df2 = stopwords.transform(df1)

stringIndex = ft.CountVectorizer(inputCol='input_stop', outputCol='input_indexed')
cv_model = stringIndex.fit(df2)
df3 = cv_model.transform(df2)
df3.select('input_stop', 'input_indexed').show(truncate=False)

lda = LDA(k=2, maxIter=10, optimizer='em', featuresCol='input_indexed')
model = lda.fit(df3)
print("vocab size", model.vocabSize())
print(model.topicsMatrix())

topics = model.describeTopics()
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

result = model.transform(df3)
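# A small follow-up sketch (not in the original script): map the termIndices from
# describeTopics() back to actual words using the CountVectorizer vocabulary fitted
# above as cv_model.
vocab = cv_model.vocabulary
for row in topics.collect():
    terms = [vocab[idx] for idx in row['termIndices']]
    print('topic', row['topic'], ':', terms)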
seed = 2020
save_dir = "models"
model_dir = "/lr"
features = "text"
label = "first_label"
data_dir = "training_sample"

logger.info("Starting Spark Context")
spark = sparknlp.start()
conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true"))
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sqlcontext = pyspark.SQLContext(sc)

training_set = (sqlcontext.read.format("parquet").option(
    "header", True).load(data_dir))

# TF
cv = sf.CountVectorizer(inputCol=features, outputCol="tf_features")
# IDF
idf = sf.IDF(inputCol="tf_features", outputCol="features")
# StringIndexer
label_string = sf.StringIndexer(inputCol=label, outputCol="label")
# Logistic regression
lr = LogisticRegression(maxIter=10, family="multinomial")

pipeline = Pipeline(stages=[cv, idf, label_string, lr])
paramGrid = (ParamGridBuilder().addGrid(cv.vocabSize, [500, 1000, 1500]).addGrid(
    lr.regParam, [0.1, 0.01, 0.001]).build())
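# A plausible next step, not shown in the original script: run the grid search with
# CrossValidator. The evaluator choice, metric, and fold count below are assumptions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator

evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="f1")
cv_search = CrossValidator(estimator=pipeline,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           numFolds=3,
                           seed=seed)
cv_model = cv_search.fit(training_set)
best_model = cv_model.bestModel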
print(test.count())

# COMMAND ----------

# MAGIC %md #### Train Classifier

# COMMAND ----------

from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens',
                                       stopWords=stopWords)
tf = spark_ft.CountVectorizer(vocabSize=500, inputCol='clean_tokens', outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='idf')

feature_pipeline = Pipeline(stages=[sw_remover, tf, idf])
feature_model = feature_pipeline.fit(train)
train_featurized = feature_model.transform(train).persist()

# COMMAND ----------

display(train_featurized.groupBy("label").count())

# COMMAND ----------

from pyspark.ml import classification as spark_cls
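# A minimal sketch of the training step that would follow this import, assuming the
# 'idf' feature column and the existing 'label' column; the classifier choice and
# hyperparameters here are assumptions, not the notebook's actual settings.
lr = spark_cls.LogisticRegression(featuresCol='idf', labelCol='label', maxIter=10)
lr_model = lr.fit(train_featurized)

test_featurized = feature_model.transform(test)
display(lr_model.transform(test_featurized).groupBy('label', 'prediction').count())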
model_dir = "/nn" features = "text" label = "first_label" data_dir = "/home/loic/train/training_sample" logger.info("Starting Spark Context") conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true")) sc = pyspark.SparkContext.getOrCreate(conf=conf) sqlcontext = pyspark.SQLContext(sc) training_set = (sqlcontext.read.format("parquet").option( "header", True).load(data_dir)) # TF cv = sf.CountVectorizer(inputCol="text", outputCol="tf_features", vocabSize=input_dim) # IDF idf = sf.IDF(inputCol="tf_features", outputCol="features") label_string = sf.StringIndexer(inputCol="first_label", outputCol="label") pipeline_dl = Pipeline(stages=[cv, idf, label_string]) df = pipeline_dl.fit(training_set).transform(training_set) df = df.rdd.map(lambda x: (LabeledPoint(x[ 'label'], MLLibVectors.fromML(x['features'])))) logger.info("Pipeline created ...") logger.info("Transforms the text into tf idf RDD ...") model = create_keras_model(input_dim, output_dim) logger.info("Starts Training ...") spark_model = SparkMLlibModel(model=model, frequency='epoch',