Example #1
def test_simplepipe():
    # Build a two-row DataFrame with a single string column 'sentence'.
    df = SPARK_SESSION.sparkContext.\
        parallelize([['this is a test'], ['this is another test']]).\
        toDF(schema=types.StructType().add('sentence', types.StringType()))

    # Chain Tokenizer -> CountVectorizer -> IDF; the `|` operator comes from the
    # pipeline-composition helper under test, which connects the stages' columns.
    pl = feature.Tokenizer().setInputCol('sentence') | \
        feature.CountVectorizer() | \
        feature.IDF()
    pl_model = pl.fit(df)
    pl_model.transform(df).count()
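For comparison, here is a minimal sketch of the same Tokenizer, CountVectorizer and IDF chain written with the stock pyspark.ml Pipeline API; the intermediate column names ('words', 'tf') are assumptions, since the `|` helper above presumably wires the columns together automatically.

# A sketch with the standard Pipeline API; 'words' and 'tf' column names are assumptions.
from pyspark.ml import Pipeline
from pyspark.ml import feature

tokenizer = feature.Tokenizer(inputCol='sentence', outputCol='words')
count = feature.CountVectorizer(inputCol='words', outputCol='tf')
idf = feature.IDF(inputCol='tf', outputCol='tfidf')

model = Pipeline(stages=[tokenizer, count, idf]).fit(df)
model.transform(df).select('sentence', 'tfidf').show(truncate=False)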
Example #2
def n_gram(df, input_col, n=2):
    """
    Builds TF-IDF features over unigrams and n-grams of a string column in a Spark DataFrame.
    :param df: PySpark DataFrame to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram, >= 1.
    :return: Tuple of (transformed DataFrame with a 'features' TF-IDF column, fitted pipeline model).
    """

    is_dataframe(df)

    # Tokenize the input column and drop stop words.
    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    # Count unigrams and n-grams separately, then assemble both into a single vector.
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    # Rescale the raw counts by inverse document frequency.
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
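A possible call site for n_gram, as a sketch; the 'sentence' column name is taken from the test DataFrames in the neighbouring examples and is otherwise an assumption.

# Sketch: turn a DataFrame with a 'sentence' string column (assumed name) into
# unigram + bigram TF-IDF features; the fitted model can be reused on new data.
df_tfidf, tfidf_model = n_gram(df, input_col='sentence', n=2)
df_tfidf.select('sentence', 'features').show(truncate=False)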
Example #3
def getTFIDF(closest):
    grouped_clusters = closest.groupBy("prediction")\
        .agg(F.collect_list("split_aspect").alias("text"))\
        .withColumn("text", F.concat_ws(" ", "text"))

    tokenizer = feat.Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(grouped_clusters)

    # Get term frequencies with CountVectorizer: unlike HashingTF it keeps a vocabulary,
    # so feature indices can be mapped back to the original words.
    cv = feat.CountVectorizer(inputCol="words", outputCol="rawFeatures").fit(wordsData)
    featurizedData = cv.transform(wordsData)

    # save vocab object
    vocab = cv.vocabulary

    # compute idf
    idf = feat.IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    tfidf = idfModel.transform(featurizedData)

    return tfidf, vocab
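One way to use the returned vocabulary is to map each cluster's highest-weighted TF-IDF indices back to words. The helper below is hypothetical, not part of the original code, and assumes the column names produced by getTFIDF above.

# Hypothetical follow-up: list the top-k TF-IDF terms per cluster via the saved vocabulary.
from pyspark.sql import functions as F, types as T

tfidf, vocab = getTFIDF(closest)

def top_terms(features, k=10):
    # 'features' is a SparseVector; sort its (index, weight) pairs by weight, descending.
    pairs = sorted(zip(features.indices, features.values), key=lambda p: -p[1])[:k]
    return [vocab[int(i)] for i, _ in pairs]

top_terms_udf = F.udf(top_terms, T.ArrayType(T.StringType()))
tfidf.select('prediction', top_terms_udf('features').alias('top_terms')).show(truncate=False)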
Example #4
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    # Tokenize and remove stop words, then branch into unigram, bigram and trigram
    # term counts, assemble them into one vector, and rescale with IDF.
    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
Example #5
    features = "text"
    label = "first_label"
    data_dir = "training_sample"
    logger.info("Starting Spark Context")

    spark = sparknlp.start()
    conf = (pyspark.SparkConf().set("spark.ui.showConsoleProgress", "true"))
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sqlcontext = pyspark.SQLContext(sc)
    training_set = (sqlcontext.read.format("parquet").option(
        "header", True).load(data_dir))
    # TF
    cv = sf.CountVectorizer(inputCol=features, outputCol="tf_features")

    # IDF
    idf = sf.IDF(inputCol="tf_features", outputCol="features")

    # StringIndexer
    label_string = sf.StringIndexer(inputCol=label, outputCol="label")

    # Logistic regression
    lr = LogisticRegression(maxIter=10, family="multinomial")
    pipeline = Pipeline(stages=[cv, idf, label_string, lr])

    paramGrid = (ParamGridBuilder().addGrid(cv.vocabSize,
                                            [500, 1000, 1500]).addGrid(
                                                lr.regParam,
                                                [0.1, 0.01, 0.001]).build())

    logger.info("Pipeline created ...")
    logger.info("Starts grid search ...")
Example #6
    .setStopWords(stopWords) \
    .setCaseSensitive(False)

normalizer = Normalizer() \
    .setInputCols(["stem"]) \
    .setOutputCol("normalized")

finisher = Finisher() \
    .setInputCols(["clean_tokens"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, stemmer, stopwords, finisher])
nlp_model = nlp_pipeline.fit(df)
processed = nlp_model.transform(df).persist()
processed = processed.withColumn("ntokens", remove_url(F.col("ntokens")))

tf = spark_ft.HashingTF(numFeatures=1 << 16, inputCol='ntokens', outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='tfidf')
feature_pipeline = Pipeline(stages=[tf, idf])

feature_model = feature_pipeline.fit(processed)
features = feature_model.transform(processed).persist()
features.show(100, False)
features = features.select('sponsoring_country', 'tweetid', 'userid', 'tweet_text', 'is_validation', 'tfidf')
features.write.parquet('tweets_tfidf.parquet')



# Map the string label to a binary float target: 1.0 for 'escalate', 0.0 otherwise.
label = F.udf(lambda x: 1.0 if x == 'escalate' else 0.0, FloatType())
df = df.withColumn('label', label('label'))


# In[23]:

df.select('label').show()


# In[24]:

import pyspark.ml.feature as feat
# Term frequencies via feature hashing (no vocabulary is kept), followed by IDF rescaling.
TF_ = feat.HashingTF(inputCol="words without stop",
                     outputCol="rawFeatures", numFeatures=100000)
IDF_ = feat.IDF(inputCol="rawFeatures", outputCol="features")


# In[25]:

pipelineTFIDF = Pipeline(stages=[TF_, IDF_])


# In[26]:

pipelineFit = pipelineTFIDF.fit(df)
df = pipelineFit.transform(df)


# In[27]: