Example #1
    def buildModel(self, save_pipe_path=None):
        df = self.getModelData()

        # Feature stages: index the label, split the comma-separated app names,
        # vectorize the tokens, assemble and standardize the feature vector.
        label_index = fea.StringIndexer(inputCol='user_type', outputCol='label')
        reTokenizer = fea.RegexTokenizer(inputCol='appnames', outputCol='appname_token', pattern=',')
        cnt_vector = fea.CountVectorizer(inputCol='appname_token', outputCol='appname_vector')
        vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol='feature')
        scaler = fea.StandardScaler(inputCol='feature', outputCol='features')

        if not save_pipe_path:
            # Tune elasticNetParam by cross-validation and report area under the PR curve.
            lr = LogisticRegression()
            grid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0, 1]).build()
            evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")

            pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler])
            pipe = pipeline.fit(df)
            pipe_out = pipe.transform(df)

            cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
            model = cv.fit(pipe_out)

            print(evaluator.evaluate(model.transform(pipe_out)))
            print('Best Param (elasticNetParam):', model.bestModel._java_obj.getElasticNetParam())

            predict_result = model.transform(pipe_out).select('probability', 'label').toPandas()
            predict_result.to_csv('/home/chenchen/data/predict_result1.csv', index=False)
        else:
            # Fit the full pipeline with the chosen hyperparameter and persist it.
            lr = LogisticRegression(elasticNetParam=1.0)

            pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler, lr])
            model = pipeline.fit(df)

            model.save(save_pipe_path)
            print('pipe saved')
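
A minimal usage sketch (assumption, not part of the original example): loading the pipeline saved by buildModel(save_pipe_path=...) and scoring a new DataFrame; new_df is hypothetical and must contain the 'user_type' and 'appnames' columns.

from pyspark.ml import PipelineModel

pipe_model = PipelineModel.load(save_pipe_path)
scored = pipe_model.transform(new_df)   # adds 'probability' and 'prediction' columns
scored.select('probability', 'prediction').show(5)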
Example #2
def test_simplepipe():
    df = SPARK_SESSION.sparkContext.\
        parallelize([['this is a test'], ['this is another test']]).\
        toDF(schema=types.StructType().add('sentence', types.StringType()))

    pl = feature.Tokenizer().setInputCol('sentence') | \
        feature.CountVectorizer() | \
        feature.IDF()
    pl_model = pl.fit(df)
    pl_model.transform(df).count()
Example #3
def test_ml_pipe():
    df = sc. \
         parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
         toDF()

    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #4
def n_gram(df, input_col, n=2):
    """
    Builds combined unigram and n-gram TF-IDF features for a text column of a Spark DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram, >= 1.
    :return: Tuple of (transformed Spark DataFrame, fitted pipeline model).
    """

    is_dataframe(df)

    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
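
A short usage sketch (assumption: a DataFrame df with a free-text column named 'sentence'):

df_grams, ngram_model = n_gram(df, input_col='sentence', n=2)
df_grams.select('features').show(2, truncate=False)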
Example #5
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
Example #6
def test_stackedml_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    ml = pl | (classification.LogisticRegression(),) | feature.VectorAssembler() | \
        classification.\
        RandomForestClassifier()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #7
def logistic_regression_text(df, input_col):
    """
    Runs a logistic regression on a text column of the input DataFrame.
    :param df: Pyspark dataframe to analyze.
    :param input_col: Text column to use as model input (the 'label' column is used as the target).
    :return: Tuple of (DataFrame with predictions, fitted model).
    """

    assert_spark_df(df)

    pl = feature.Tokenizer().setInputCol(input_col) | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()
    ml_model = ml.fit(df)
    df_model = ml_model.transform(df)
    return df_model, ml_model
Example #8
def test_multi_model_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    models = (classification.LogisticRegression(),
              classification.RandomForestClassifier(),
              classification.LogisticRegression().setElasticNetParam(0.2),
              classification.GBTClassifier())
    ml = pl | models | feature.VectorAssembler().setOutputCol('final_features') | \
        classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #9
def getTFIDF(closest):
    grouped_clusters = closest.groupBy("prediction")\
        .agg(F.collect_list("split_aspect").alias("text"))\
        .withColumn("text", F.concat_ws(" ", "text"))

    tokenizer = feat.Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(grouped_clusters)

    # get term frequencies (CountVectorizer, unlike HashingTF, keeps a vocabulary, so indices can be mapped back to words)
    cv = feat.CountVectorizer(inputCol="words", outputCol="rawFeatures").fit(wordsData)
    featurizedData = cv.transform(wordsData)

    # save vocab object
    vocab = cv.vocabulary

    # compute idf
    idf = feat.IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    tfidf = idfModel.transform(featurizedData)

    return tfidf, vocab
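
A follow-up sketch (assumption, not part of the original function; closest is the caller's clustered DataFrame): listing the highest-weighted TF-IDF terms per cluster by mapping the sparse vector indices back through the returned vocabulary.

tfidf, vocab = getTFIDF(closest)

def top_terms(row, n=10):
    v = row['features']  # SparseVector of TF-IDF weights
    top = sorted(zip(v.indices, v.values), key=lambda p: -p[1])[:n]
    return row['prediction'], [vocab[int(i)] for i, _ in top]

for cluster, terms in tfidf.select('prediction', 'features').rdd.map(top_terms).collect():
    print(cluster, terms)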
Example #10
def create_vocab(df):
	"""Create a vocabulary from a dataframe.
	Also removes some special tokens.
	
	Args:
		df: A dataframe with columns 'processed_abstract'
			and 'processed_full_text'.

	Returns:
		vocab: A word list sorted by descending frequency.
	"""
	concat_udf = F.udf(
		lambda cols: " ".join([col for col in cols]),
		spark_types.StringType())
	df = df.withColumn(
		'all_text',
		concat_udf(F.array(
			'processed_abstract',
			'processed_full_text')))
	tokenizer = ml_feature.Tokenizer(
		inputCol='all_text',
		outputCol='tokens')
	df = tokenizer.transform(df)
	cv = ml_feature.CountVectorizer(
		inputCol='tokens',
		outputCol='vectors',
		vocabSize=200000)
	cv_model = cv.fit(df)

	# the vocabulary is sorted by descending corpus frequency
	vocab = cv_model.vocabulary
	vocab.remove(SENT_START)
	vocab.remove(SENT_END)
	vocab.remove(SEC_START)
	vocab.remove(SEC_END)

	return vocab
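
Usage sketch (papers_df is a hypothetical DataFrame with the required 'processed_abstract' and 'processed_full_text' columns):

vocab = create_vocab(papers_df)
print(len(vocab), vocab[:20])   # most frequent tokens first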
Example #11
text_1 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/49960.txt')
text_2 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/51060.txt')

text_data = text_1.union(text_2)

tokenizer = ft.RegexTokenizer(inputCol='documents',
                              outputCol='input_arr',
                              pattern=r'\s+|[,.\"]')
df1 = tokenizer.transform(text_data)

stopwords = ft.StopWordsRemover(inputCol='input_arr', outputCol='input_stop')
df2 = stopwords.transform(df1)

count_vectorizer = ft.CountVectorizer(inputCol='input_stop',
                                      outputCol='input_indexed')
cv_model = count_vectorizer.fit(df2)

df3 = cv_model.transform(df2)
df3.select('input_stop', 'input_indexed').show(truncate=False)

lda = LDA(k=2, maxIter=10, optimizer='em', featuresCol='input_indexed')
model = lda.fit(df3)
print("vocal size", model.vocabSize())
print(model.topicsMatrix)

topics = model.describeTopics()
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

result = model.transform(df3)
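
A possible continuation (assumption, not in the original example): resolving the term indices from describeTopics() back to words through the CountVectorizer vocabulary.

vocab = cv_model.vocabulary
for row in topics.collect():
    words = [vocab[i] for i in row['termIndices']]
    print(row['topic'], list(zip(words, row['termWeights'])))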
Example #12
    seed = 2020
    save_dir = "models"
    model_dir = "/lr"
    features = "text"
    label = "first_label"
    data_dir = "training_sample"
    logger.info("Starting Spark Context")

    spark = sparknlp.start()
    conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true"))
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sqlcontext = pyspark.SQLContext(sc)
    training_set = (sqlcontext.read.format("parquet").option(
        "header", True).load(data_dir))
    # TF
    cv = sf.CountVectorizer(inputCol=features, outputCol="tf_features")

    # IDF
    idf = sf.IDF(inputCol="tf_features", outputCol="features")

    # StringIndexer
    label_string = sf.StringIndexer(inputCol=label, outputCol="label")

    # Logistic regression
    lr = LogisticRegression(maxIter=10, family="multinomial")
    pipeline = Pipeline(stages=[cv, idf, label_string, lr])

    paramGrid = (ParamGridBuilder().addGrid(cv.vocabSize,
                                            [500, 1000, 1500]).addGrid(
                                                lr.regParam,
                                                [0.1, 0.01, 0.001]).build())
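
A sketch of a possible continuation (assumption, not in the original snippet): wiring the grid into a CrossValidator and fitting it on the training set; the evaluator choice is illustrative.

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.tuning import CrossValidator

    evaluator = MulticlassClassificationEvaluator(metricName="f1")
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=3)
    cv_model = cv.fit(training_set)
    best_model = cv_model.bestModel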
Example #13
print(test.count())

# COMMAND ----------

# MAGIC %md #### Train Classifier

# COMMAND ----------

from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens',
                                       outputCol='clean_tokens',
                                       stopWords=stopWords)
tf = spark_ft.CountVectorizer(vocabSize=500,
                              inputCol='clean_tokens',
                              outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='idf')

feature_pipeline = Pipeline(stages=[sw_remover, tf, idf])
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()

# COMMAND ----------

display(train_featurized.groupBy("label").count())

# COMMAND ----------

from pyspark.ml import classification as spark_cls
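
A hedged sketch of a possible next cell (assumption; the original notebook is truncated here): fitting a classifier on the 'idf' features produced above and inspecting predictions on the test split.

lr = spark_cls.LogisticRegression(featuresCol='idf', labelCol='label', maxIter=20)
lr_model = lr.fit(train_featurized)

test_featurized = feature_model.transform(test)
display(lr_model.transform(test_featurized).groupBy('label', 'prediction').count())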
Example #14
    model_dir = "/nn"
    features = "text"
    label = "first_label"
    data_dir = "/home/loic/train/training_sample"

    logger.info("Starting Spark Context")

    conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true"))
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sqlcontext = pyspark.SQLContext(sc)
    training_set = (sqlcontext.read.format("parquet").option(
        "header", True).load(data_dir))

    # TF
    cv = sf.CountVectorizer(inputCol="text",
                            outputCol="tf_features",
                            vocabSize=input_dim)
    # IDF
    idf = sf.IDF(inputCol="tf_features", outputCol="features")
    label_string = sf.StringIndexer(inputCol="first_label", outputCol="label")
    pipeline_dl = Pipeline(stages=[cv, idf, label_string])
    df = pipeline_dl.fit(training_set).transform(training_set)
    df = df.rdd.map(lambda x: LabeledPoint(x['label'],
                                           MLLibVectors.fromML(x['features'])))
    logger.info("Pipeline created ...")
    logger.info("Transforms the text into tf idf RDD ...")
    model = create_keras_model(input_dim, output_dim)

    logger.info("Starts Training ...")
    spark_model = SparkMLlibModel(model=model,
                                  frequency='epoch',