def main(args):
    textFiles = sc.wholeTextFiles(maindir + '4').map(readContents)
    # print("READ second {} check".format(textFiles.take(10)))

    # Filter the rows based on all the indexes available in the training file, else drop them.
    # http://stackoverflow.com/questions/24718697/pyspark-drop-rows
    htmldf = sqlContext.createDataFrame(textFiles)
    htmldf.cache()
    traindf = getCleanedRDD(maindir + 'train_v2.csv',
                            ["id", "images", "links", "text", "label"], htmldf)
    traindf.write.save(maindir + "output/train_4.parquet", format="parquet")

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.01)

    # Alternative classifiers (not used in the pipeline below); note that the
    # RandomForestClassifier must use labelCol="label", not "features".
    # https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html
    # http://spark.apache.org/docs/latest/api/python/pyspark.ml.html
    # w2v = Word2Vec(inputCol="text", outputCol="w2v")
    gbt = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label")
    rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to the training documents.
    model = pipeline.fit(traindf)
    print('-----------------------------------------------------------------------------')

    testdf = getCleanedRDD(maindir + 'test.csv',
                           ["id", "images", "links", "text", "label"], htmldf)
    # print(testdf.count())

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(testdf)
    # pand = prediction.toPandas()
    # pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8')
    # prediction.select('id', 'probability', 'prediction') \
    #     .write.format('com.databricks.spark.csv') \
    #     .option("header", "true").save(maindir + 'output/result_lr0.csv')
    # prediction schema: DataFrame[id: string, images: bigint, links: bigint, text: string,
    #     label: double, words: array<string>, features: vector, rawPrediction: vector,
    #     probability: vector, prediction: double]
    # selected = prediction.select("id", "probability", "prediction")
    # for row in selected.collect():
    #     print(row)
    sc.stop()
# spark-submit --master local[*] --packages com.databricks:spark-csv_2.10:1.2.0 cluster.py
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, Normalizer
from pyspark.ml.clustering import KMeans

sc = SparkContext()
sqlContext = SQLContext(sc)

text = sc.textFile('file:/Users/wangmengyuan/Desktop/rr/listings.txt') \
    .map(lambda l: l.split('\t')) \
    .map(lambda l: (l[0], l[1]))
df = sqlContext.createDataFrame(text, ["houseid", "description"])

tokenizer = Tokenizer(inputCol="description", outputCol="tokens")
tokenized = tokenizer.transform(df).cache()

remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
stopWordsRemoved_df = remover.transform(tokenized).cache()

hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures", numFeatures=200)
tfVectors = hashingTF.transform(stopWordsRemoved_df).cache()

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors).cache()

# L2-normalize the TF-IDF vectors before clustering.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures")
l2NormData = normalizer.transform(tfIdfVectors)

# Cluster on the normalized vectors (KMeans defaults to featuresCol="features",
# which would ignore the normalization step above).
kmeans = KMeans(featuresCol="normFeatures").setK(10).setMaxIter(20)
km_model = kmeans.fit(l2NormData)
clustersTable = km_model.transform(l2NormData)

# Save to HDFS
df1 = clustersTable[['houseid', 'prediction']]
# df1.select('houseid', 'prediction').write.format('com.databricks.spark.csv').save('cluster.csv')
df1.select('houseid', 'prediction').show(20)
from pyspark.ml.feature import CountVectorizer

count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')
cv_df = count_vec.fit(refined_df).transform(refined_df)
cv_df.select(['user_id', "business_id", "review_id", 'refined_tokens', 'features']).show(1, True, True)
count_vec.fit(refined_df).vocabulary

from pyspark.ml.feature import HashingTF, IDF

hashing_vec = HashingTF(inputCol='refined_tokens', outputCol='tf_features')
hashing_df = hashing_vec.transform(refined_df)
hashing_df.select(['user_id', 'refined_tokens', 'tf_features']).show(4, True, True)

tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)
tf_idf_df.select('tf_idf_features').show(1, True, True)
tf_idf_df.show(1, True, True)


def get_dummy(df, indexCol, categoricalCols, continuousCols, labelCol, dropLast=False):
    from pyspark.ml import Pipeline
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create GBM model
gbm = H2OGBM(ratio=0.8, featuresCols=[idf.getOutputCol()], predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
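# The ColumnPruner call above is cut off mid-list. A minimal sketch of how it could be
# closed and the stages chained into a pipeline; the final pruned column and the stage
# order are assumptions, not taken from the source.
from pyspark.ml import Pipeline

colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol(),  # assumed: prune the tokenizer output as well
])

pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, gbm, colPruner])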
# MAGIC %md ### Define the Pipeline
# MAGIC The pipeline for the model consists of the following stages:
# MAGIC - A Tokenizer to split the tweets into individual words.
# MAGIC - A StopWordsRemover to remove common words such as "a" or "the" that have little predictive value.
# MAGIC - A HashingTF class to generate numeric vectors from the text values.
# MAGIC - A LogisticRegression algorithm to train a binary classification model.

# COMMAND ----------

# Convert each sentence into a list of words
tokenizer = Tokenizer(inputCol="text", outputCol="SentimentWords")
# Remove stop words
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="MeaningfulWords")
# Convert words to numeric features via term-frequency hashing
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
# Set up the model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.01)
# Chain the four transforms into a single pipeline
pipeline = Pipeline(stages=[tokenizer, swr, hashTF, lr])

# COMMAND ----------

# MAGIC %md ### Run the Pipeline as an Estimator
# MAGIC The pipeline itself is an estimator, so it has a **fit** method that can be called to run the pipeline on a specified DataFrame. In this case, we run the pipeline on the training data to train a model.

# COMMAND ----------
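# The training cell itself is not shown in the excerpt. A minimal sketch of the fit step
# described above; the training DataFrame name `train` is an assumption, not from the source.
model = pipeline.fit(train)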
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, NGram, HashingTF, IDF

spark = SparkSession.builder.appName("TfIdf-Ngram").getOrCreate()

documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id", F.row_number().over(Window.orderBy('value')))
documents.printSchema()

# Create tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)

# Build 2-grams from the tokens
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordsData)

# Apply TF on the n-gram data
# (alternatively, CountVectorizer can also be used to get term frequency vectors)
hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(ngramDataFrame)

# Calculate the IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Display the results
rescaledData.select("doc_id", "features").show(truncate=False)

# Close the Spark session
spark.stop()
def getvalue3():
    if request.method == 'POST':
        subreddit_input = request.form['subreddit']
        # subreddit_input = 'World Politics'
        subreddit_filter = requests.get(
            url + 'reddit_post.json?orderBy="subreddit"&equalTo="' + str(subreddit_input) + '"')
        subreddits = json.loads(subreddit_filter.text)

        results = []
        for x in subreddits:
            try:
                results.append(subreddits[x])
            except KeyError:
                continue

        data = pd.DataFrame.from_dict(results, orient='columns')
        data1 = spark.createDataFrame(pd.DataFrame(data["title"]))
        data1.show(truncate=False)

        # Clean the raw titles with a UDF
        clean_data_udf = udf(clean_data, StringType())
        data1 = data1.withColumn("new_title", clean_data_udf("title"))
        data1.show()

        # Tokenize, remove stop words, and build TF-IDF features
        tokenizer = Tokenizer(inputCol="new_title", outputCol="words")
        data1 = tokenizer.transform(data1)
        data1.show()

        remover = StopWordsRemover(inputCol="words", outputCol="rm_words")
        data1 = remover.transform(data1)
        data1.show()

        hashingTF = HashingTF(inputCol="rm_words", outputCol="rawFeatures", numFeatures=2000)
        data1 = hashingTF.transform(data1)
        data1.show()
        data1.select("rm_words").show(truncate=False)
        data1.select("rawFeatures").show(truncate=False)

        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(data1)
        data1 = idfModel.transform(data1)
        data1.select("features").show(truncate=False)

        # Cluster the titles into two topics
        kmeans = KMeans(k=2, featuresCol="features").setSeed(1)
        kmeans_model = kmeans.fit(data1)
        data1 = kmeans_model.transform(data1)
        data1.select("prediction").show(50)

        data["prediction"] = data1.select("prediction").toPandas()
        print(data["prediction"].value_counts())
        # topic_generator(subreddit_input)

        topic1 = data[data['prediction'] == 0]['title'].reset_index(drop=True)
        topic2 = data[data['prediction'] == 1]['title'].reset_index(drop=True)

        topic1_1 = topic1[0]
        topic1_2 = topic1[1]
        topic1_3 = topic1[2]
        topic1_4 = topic1[3]
        topic1_5 = topic1[4]
        topic2_1 = topic2[0]
        topic2_2 = topic2[1]
        topic2_3 = topic2[2]
        topic2_4 = topic2[3]
        topic2_5 = topic2[4]

        return render_template('title_topic_update.html',
                               topic1_1=topic1_1, topic1_2=topic1_2, topic1_3=topic1_3,
                               topic1_4=topic1_4, topic1_5=topic1_5,
                               topic2_1=topic2_1, topic2_2=topic2_2, topic2_3=topic2_3,
                               topic2_4=topic2_4, topic2_5=topic2_5)
    else:
        return render_template('title_topic.html')
# Convert the tokens into groups of n words.
# NGram is a feature transformer that converts the input array of strings into an array of
# n-grams. Null values in the input array are ignored. It returns an array of n-grams where
# each n-gram is represented by a space-separated string of words. When the input is empty,
# an empty array is returned. When the input array length is less than n (number of elements
# per n-gram), no n-grams are returned.
ngram_df = NGram(n=2, inputCol="words", outputCol="ngrams").transform(words)
ngram_df.show(truncate=False)
ngram_df.select("ngrams").show(truncate=False)

# TF-IDF is a numerical statistic that is intended to reflect how important a word is to a
# document in a collection or corpus. It is often used as a weighting factor in searches of
# information retrieval, text mining, and user modeling.
df = words.select("words")
df.show(truncate=False)

# HashingTF is TF with hashing enabled, which fixes the size of the feature vector.
df_tf = HashingTF(
    inputCol="words",
    outputCol="hashing_tf",
    numFeatures=15  # the default is 262,144 dimensions
).transform(df)
df_tf.show()
df_tf.select("words").show(truncate=False)
# The first list holds the term indices; the second list holds the term counts.
df_tf.select("hashing_tf").show(truncate=False)

# IDF
df_tf_idf = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(df_tf).transform(df_tf)
df_tf_idf.show()
df_tf_idf.select("words").show(truncate=False)
df_tf_idf.select("hashing_tf").show(truncate=False)

# Hashing TF
def test_save_load_pipeline_estimator(self):
    temp_path = tempfile.mkdtemp()
    training = self.spark.createDataFrame([
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
    ], ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    ova = OneVsRest(classifier=LogisticRegression())
    lr1 = LogisticRegression().setMaxIter(5)
    lr2 = LogisticRegression().setMaxIter(10)

    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100]) \
        .addGrid(ova.classifier, [lr1, lr2]) \
        .build()

    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator())
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel = tvs.fit(training)

    # Test save/load of TrainValidationSplitModel.
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
    for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                          tvsModel.bestModel.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)

    # Test nested pipeline
    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
    tvs2 = TrainValidationSplit(
        estimator=nested_pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator())
    tvs2Path = temp_path + "/tvs2"
    tvs2.save(tvs2Path)
    loadedTvs2 = TrainValidationSplit.load(tvs2Path)
    self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
    self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

    # Run train validation split, and choose the best set of parameters.
    tvsModel2 = tvs2.fit(training)

    # Test save/load of the nested TrainValidationSplitModel.
    tvsModelPath2 = temp_path + "/tvsModel2"
    tvsModel2.save(tvsModelPath2)
    loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
    self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
    loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
    original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
    self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
    self.assertEqual(len(loaded_nested_pipeline_model.stages),
                     len(original_nested_pipeline_model.stages))
    for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                          original_nested_pipeline_model.stages):
        self.assertEqual(loadedStage.uid, originalStage.uid)
# Fill missing comments, then convert the pandas frames to Spark DataFrames.
train_df.fillna('', inplace=True)
train_df = spark.createDataFrame(train_df)

test_df = pd.read_csv('test.csv')
test_df.fillna('', inplace=True)
test_df = spark.createDataFrame(test_df)

# Label columns (everything except the id and the raw text).
out_cols = [i for i in train_df.columns if i not in ['id', 'comment_text']]
# train_df.filter(F.col('toxic') == 1).show(5)

# Tokenizer
tokenizer = Tokenizer(inputCol='comment_text', outputCol='words')
words_data = tokenizer.transform(train_df)

# Term frequency via the hashing trick
hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
tf = hashing_tf.transform(words_data)
tf.select('rawFeatures').take(5)
tf.count(), len(tf.columns)

# Inverse document frequency
idf = IDF(inputCol='rawFeatures', outputCol='features')
idfModel = idf.fit(tf)
tf_idf = idfModel.transform(tf)

# Performing the logistic regression
REG = 0.01
lr = LogisticRegression(featuresCol='features', labelCol='toxic', regParam=REG)
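# The snippet stops right after defining the regression. A minimal sketch of the fit/score
# step, assuming the binary 'toxic' label and the 'id' column from the training CSV; this
# is not taken from the source.
lr_model = lr.fit(tf_idf)
res_train = lr_model.transform(tf_idf)
res_train.select('id', 'toxic', 'probability', 'prediction').show(5)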
def main():
    spark = SQLContext(SparkContext.getOrCreate())

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description', outputCol='words_all', pattern=r'\W')

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(inputCol='words_all',
                                        outputCol='words_clean').setStopWords(stopwords)

    # get word frequencies using a simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean', outputCol='words_count',
                                    vocabSize=1000, minDF=2)

    # get tf-idf word frequencies
    add_wordtf = HashingTF(inputCol='words_clean', outputCol='words_tf', numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf', outputCol='words_tfidf', minDocFreq=2)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])

    # apply data preparation pipeline
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)

    # split into training and testing
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)

    # fit logistic regression models
    logistic_wordcount = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                            featuresCol='words_count', labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')
    logistic_tfidf = LogisticRegression(regParam=0.3, elasticNetParam=0,
                                        featuresCol='words_tfidf', labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  metricName='accuracy')
    for model, name in ((logistic_wordcount, 'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
A quick reminder about these concepts:

- The hashing trick provides a fast and space-efficient way to map a very large (possibly
  infinite) set of items (in this case, all words contained in the SMS messages) onto a
  smaller, finite number of values.
- The TF-IDF matrix reflects how important a word is to each document. It takes into account
  both the frequency of the word within each document and the frequency of the word across
  all of the documents in the collection.

The tokenized SMS data are stored in sms in a column named words. You've cleaned up the
handling of spaces in the data so that the tokenized text is neater.

Instructions

- Import the StopWordsRemover, HashingTF and IDF classes.
- Create a StopWordsRemover object (input column words, output column terms). Apply to sms.
- Create a HashingTF object (input results from previous step, output column hash). Apply to wrangled.
- Create an IDF object (input results from previous step, output column features). Apply to wrangled.

SOLUTION

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)
###############################################################################################
# Pipeline
###############################################################################################

# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stop words from the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="no_stops",
                            stopWords=swords)

# The cheaper way to do TF-IDF:
# HashingTF produces a sparse term-frequency vector, so zero-valued entries are not stored.
# The output looks like (number_of_features, {index: count, ...}); indices with a zero count
# are skipped, so the keys can jump (e.g. 0, 1, 6, 8, ...) depending on the previous step.
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")

# Perform the IDF part of TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features1", minDocFreq=5)

# Append the Tokenizer-StopWordsRemover-HashingTF-IDF output to the Vader output
assembler = VectorAssembler(inputCols=["features1", "vader"], outputCol="features")

# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Create the pipeline
pipeline = Pipeline(
    stages=[tokenizer, stopword, hashingTF, idf, assembler, lr])

###############################################################################################
# Fit model to training set
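# The fitting cell is not included in the excerpt. A minimal sketch of the step announced by
# the header above; the split names `train` and `test` are assumptions, not from the source.
pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)
predictions.select("label", "prediction", "probability").show(5)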
# For natural-language processing it is customary to L2-normalize the feature vectors:
# that is apparently what works best.
from pyspark.ml.feature import Normalizer

normalizerUni = Normalizer(inputCol='words', outputCol='normWords', p=2.0)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print("DataFrame (bi-gram): normalized")
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table because there is only room to display the
# indices of the non-zero elements, not their values.

# Moving on to TF-IDF.
# By choosing the right DataFrame among those above, these computations can of course be
# applied to any column (bigrams, with or without stop words, ...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step has been useful to me before; here it does not seem all that necessary.
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()
data, Y = lf.loadLabeled("./data/train")
labeledData = zip(data, [y.item() for y in Y])
labeledRdd = sc.parallelize(labeledData)


def cleanLower(doc):
    return doc.replace("<br /><br />", " ").lower()


rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
dfTrain = sqlContext.createDataFrame(rdd, ['review', 'label'])

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])
model = pipeline.fit(dfTrain)
print("The model is fitted")
# COMMAND ----------

# MAGIC %md
# MAGIC ### Prepare the Pipeline
# MAGIC For compatibility with Azure Model Management, make sure you are training the model on a cluster with Spark less than 2.3.0, since Model Management runs on Spark 2.1.1 and the Linear Regression model has a new param (epsilon) added in 2.3.0.

# COMMAND ----------

tkn = Tokenizer().setInputCol("abstract").setOutputCol("tokens")
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover().setStopWords(englishStopWords) \
    .setInputCol("tokens").setOutputCol("tokens_no_stop")
tf = HashingTF().setInputCol("tokens_no_stop").setOutputCol("TFOut").setNumFeatures(1000)
idf = IDF().setInputCol("TFOut").setOutputCol("IDFOut").setMinDocFreq(1)
# Note: the assembler uses the raw TF output; pass ["IDFOut"] instead to feed the
# IDF-weighted features to the model.
assem = VectorAssembler().setInputCols(["TFOut"]).setOutputCol("features")
rename = SQLTransformer().setStatement(
    "SELECT features, amt as label FROM __THIS__")
reg = LinearRegression()
pipe = Pipeline().setStages([tkn, stops, tf, idf, assem, rename, reg])

# COMMAND ----------

# MAGIC %md
# MAGIC ### Fit the Pipeline

# COMMAND ----------
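# The fitting cell is not shown in the excerpt. A minimal sketch of the step announced above;
# the training DataFrame name `train` is an assumption, not from the source.
pipeModel = pipe.fit(train)
scored = pipeModel.transform(train)
scored.select("features", "label", "prediction").show(5)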
# ## Learning pipeline

# In[8]:

from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol='summary', outputCol='words')
pipeline = Pipeline(stages=[
    tokenizer,
    StopWordsRemover(inputCol='words', outputCol='filtered_words'),
    HashingTF(inputCol='filtered_words', outputCol='rawFeatures', numFeatures=120000),
    IDF(inputCol='rawFeatures', outputCol='features'),
    LogisticRegression(regParam=.3, elasticNetParam=.01)
])

# ## Testing the model accuracy

# In[9]:

model = pipeline.fit(train_reviews)

# In[10]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator
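# The evaluation cell ends after the import. A minimal sketch of how the evaluator could be
# used; the held-out DataFrame name `test_reviews` and its `label` column are assumptions,
# not from the source.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
predictions = model.transform(test_reviews)
print('Area under ROC:', evaluator.evaluate(predictions))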
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import sys
import string

spark = SparkSession.builder\
    .appName("datasetTraining")\
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataset = spark.read.csv('/bherr006/datasetTraining/training.1600000.processed.noemoticon.csv',
                         header=False, inferSchema=True)
(trainSet, valSet, testSet) = dataset.randomSplit([0.98, 0.01, 0.01], seed=2000)

tokenizer = Tokenizer(inputCol="_c5", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labelStringIndex = StringIndexer(inputCol="_c0", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, labelStringIndex])

pipelineFit = pipeline.fit(trainSet)
trainDf = pipelineFit.transform(trainSet)
valDf = pipelineFit.transform(valSet)

lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(trainDf)
predictions = lrModel.transform(valDf)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predictions))
# Tokenizer to create a "terms" column, so for example from
# content=u'We start learning Spark' we get terms=[u'we', u'start', u'learning', u'spark'].
tokenizer = Tokenizer(inputCol="content", outputCol="terms")
termsData = tokenizer.transform(data)

# Remover to drop stop words that don't contribute, so for example from
# terms=[u'we', u'start', u'learning', u'spark'] we get filtered=[u'start', u'learning', u'spark'].
remover = StopWordsRemover(inputCol="terms", outputCol="filtered")
filteredTermsData = remover.transform(termsData)

# http://spark.apache.org/docs/latest/ml-features.html
# Both HashingTF and CountVectorizer can be used to generate the term frequency vectors.
# HashingTF is a Transformer which takes sets of terms and converts those sets into
# fixed-length feature vectors. In text processing, a "set of terms" might be a bag of words.
# HashingTF utilizes the hashing trick.
# So from filtered=[u'start', u'learning', u'spark'] we get
# rawFeatures=SparseVector(262144, {29470: 1.0, 62173: 1.0, 181346: 1.0}).
tf = HashingTF(inputCol="filtered", outputCol="rawFeatures").transform(filteredTermsData)

# IDF is an Estimator which is fit on a dataset and produces an IDFModel. The IDFModel takes
# feature vectors (generally created from HashingTF or CountVectorizer) and scales each column.
# Intuitively, it down-weights columns which appear frequently in a corpus.
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(tf)

# TF-IDF
tfidf = idf.transform(tf)

labels = data.map(lambda doc: doc["label"])  # Standard Python dict access

# Training and Test datasets
# Here feature #5 contains the data for training, for example:
# [Row(label=0.0, content=u'We start learning Spark',
#      terms=[u'we', u'start', u'learning', u'spark'],
#      filtered=[u'start', u'learning', u'spark'],
#      rawFeatures=SparseVector(262144, {29470: 1.0, 62173: 1.0, 181346: 1.0}),
#      features=SparseVector(262144, {29470: 0.9163, 62173: 0.9163, 181346: 0.9163}))]
# In[14]:

# Keep only rows where the label (n_killed) is at most 2
notes_length_df = notes_length_df.filter(notes_length_df.label <= 2)

# In[15]:

# Create features
tokenizer = Tokenizer(inputCol="notes", outputCol="token_notes")
stopremove = StopWordsRemover(inputCol='token_notes', outputCol='stop_tokens')
# Note: hashingTF reads the raw tokens; use inputCol='stop_tokens' to hash the
# stop-word-filtered tokens instead.
hashingTF = HashingTF(inputCol="token_notes", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

# In[16]:

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'notes_length'], outputCol='features')

# In[17]:

# Create and run a data processing Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])
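# The pipeline above is only created, not run, in this excerpt. A minimal sketch of the
# fit/transform step; not taken from the source.
cleaner = data_prep_pipeline.fit(notes_length_df)
cleaned = cleaner.transform(notes_length_df)
cleaned.select('label', 'features').show(5, truncate=False)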
# Python 3 no longer allows tuple-unpacking lambdas, so index the (doc, lines) pair instead.
docs = docs.union(
    next_docs.map(lambda doc_lines: (format_text(doc_lines[1]), float(curr_cat))))
curr_cat += 1

training_rows = docs.sample(False, train_fraction)
testing_rows = docs.subtract(training_rows)

# Prepare training and test documents, which are labeled.
LabeledDocument = Row("text", "label")
train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, idf, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=1000, regParam=0.001)

# pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
p0 = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
# m0 = p0.fit(train)
# pipeline = Pipeline(stages=[m0, lr])
pipeline = p0

# Fit the pipeline to training documents.
model = pipeline.fit(train)

print('\n\n --------------- RESULT ----------------------\n\n')
print(model.transform(test).head())
print('\n\n ---------------------------------------------\n\n')
# Load data
df0 = spark.read.csv("./jobs_clean.csv", header=True, multiLine=True, inferSchema=True)
df1 = pd.read_csv('./jobs_clean.csv')
# df0.show()
print('The number of jobs:', df0.count())
print('\nthe distinct jobs name: ', df1.job.unique())
print('\nThere are', len(df1.job.unique()) - 1, 'different kinds of jobs in the table.')

# Split the desc field into words
tokenizer = Tokenizer(inputCol='desc_clean', outputCol='desc_words')
df = tokenizer.transform(df0)
# df.show()
# df.select('desc_words').show(10)

# Compute TF-IDF
hashingTF = HashingTF(inputCol='desc_words', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
# print('tfidf for each job:', tfidf.select('desc_words_tfidf').show(10, truncate=False))

# Data normalization
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
# tfidf.select("id", "norm").show(6)

# Compute similarity between jobs and resume
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

print('\nCompute the similarity between jobs and resume...')
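# The similarity computation itself is not included in the excerpt. A minimal sketch under
# the assumption that the resume has been run through the same Tokenizer/HashingTF/IDF/
# Normalizer stages to produce a vector named `resume_norm`, and that the jobs table has a
# `job` column; both names are assumptions, not from the source. Since the vectors are
# L2-normalized, cosine similarity reduces to a dot product.
dot_udf = psf.udf(lambda v: float(v.dot(resume_norm)), DoubleType())
ranked = tfidf.withColumn("similarity", dot_udf("norm")) \
              .orderBy(psf.desc("similarity"))
ranked.select("job", "similarity").show(10)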
def news_classifier():
    data = spark.read.option("mode", "DROPMALFORMED").load("/news_data.csv", format="csv",
                                                           header="true", inferSchema='true')
    data.first()
    data.printSchema()

    # There is a field in the data called constituent_id, which is the company the news
    # headline is about. Drop that column from the data.
    drop_list = ['constituent_id']
    data = data.select([column for column in data.columns if column not in drop_list])
    data.show(5)
    data.printSchema()

    # Regular-expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="news_title", outputCol="words", pattern="\\W")
    # Remove stop words
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
    # Compute bigrams
    ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
    # Add HashingTF and IDF to the transformation
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=10000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)  # minDocFreq: remove sparse terms
    # String indexer for the target
    label_stringIdx = StringIndexer(inputCol="weekly_returns", outputCol="label")

    # Create the processing pipeline
    pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, ngram, hashingTF, idf,
                                label_stringIdx])

    # Fit the pipeline to the data.
    pipelineFit = pipeline.fit(data)
    dataset = pipelineFit.transform(data)
    dataset.show(5)

    # Randomly split data into training and test sets; set the seed for reproducibility.
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    # Build a Logistic Regression model
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0, family="multinomial")
    # Train the model with the training data
    lrModel = lr.fit(trainingData)

    predictions = lrModel.transform(testData)
    predictions.filter(predictions['prediction'] == 0) \
        .select("news_title", "weekly_returns", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)

    # Multiclass evaluator
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    print(evaluator.evaluate(predictions))

    # Save predictions to CSV
    predictions = predictions.select("news_title", "weekly_returns", "prediction")
    predictions.write.format("csv").save("/Desktop/predictions-spark.csv")

    # Save the machine learning model
    model_path = "/Desktop/Spark_Model"
    lrModel.save(model_path)

    # Load the model again to make sure it works
    ml_model = lrModel.load(model_path)
    predictions2 = ml_model.transform(testData)

    # Make predictions with the loaded model
    predictions2.filter(predictions2['prediction'] == 0) \
        .select("news_title", "weekly_returns", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)

    # End the Spark session
    spark.stop()
spark = SparkSession.builder \
    .master("local") \
    .appName("Compare Multiclass Models") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

numfeatures = 2000
numclasses = 19

# Load news category data
raw_data = sc.textFile("data/news_sections_abstract2016.txt")
lines = raw_data.map(lambda line: line.split(" ")) \
    .map(lambda line: (line[0], " ".join(line[1:])))
sentenceData = spark.createDataFrame(lines, ["label", "sentence"])

# Map sentence data to hashingTF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=numfeatures)
featurizedData = hashingTF.transform(wordsData)
# featurizedData.show()

# Map string labels to integers
df = featurizedData.select('label', 'features')
data0 = df.replace(['World', 'Sports', 'Fashion & Style', 'Books', 'Music',
                    'Television', 'Movies', 'Technology', 'Science', 'Food', 'Real Estate',
                    'Theater', 'Health', 'Travel', 'Education', 'Your Money', 'Politics',
                    'Economy', 'Art & Design'],
                   ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
                    '14', '15', '16', '17', '18', '0'], 'label')

category = ['Art & Design', 'World', 'Sports', 'Fashion & Style', 'Books', 'Music',
            'Television', 'Movies', 'Technology', 'Science', 'Food', 'Real Estate', 'Theater',
            'Health', 'Travel', 'Education', 'Your Money', 'Politics', 'Economy']
dictionary = {'Art & Design': 0, 'World': 1, 'Sports': 2, 'Fashion & Style': 3, 'Books': 4,
              'Music': 5, 'Television': 6, 'Movies': 7, 'Technology': 8, 'Science': 9,
              'Food': 10, 'Real Estate': 11, 'Theater': 12,
from environment import spark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I I I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")],
    ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show(truncate=False)
def main():
    time_start = time.time()
    data = 'train_review.json'  # sys.argv[1]
    sc = elly_func.start_spark('Final_Project')

    # Total pairs = 1,029,758
    textRDD = sc.textFile(data).map(elly_func.tojson) \
        .map(lambda x: ((x['user_id'], x['business_id']), x['text'])) \
        .reduceByKey(lambda a, b: a + b) \
        .mapValues(remove_blank) \
        .map(lambda x: (x[0][0], x[0][1], x[1]))

    # Create DataFrame
    tableA = spark_session(textRDD).createDataFrame(textRDD, ['user_id', 'business_id', 'text'])

    # Remove stopwords; the raw text column is no longer needed, so drop it.
    remover = StopWordsRemover(inputCol="text", outputCol="filtered")
    df = remover.transform(tableA).drop('text')

    # Convert to the string form the Tokenizer expects.
    test = df.withColumn("sentence", df["filtered"].cast("string"))

    # Split each sentence into useful terms.
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(test)

    # TF-IDF with 200 hashed features (keeps the high-frequency terms).
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
    featurizedData = hashingTF.transform(wordsData)

    # Vectorize the terms.
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # User/business-to-term relations, ordered by id.
    user_profile = rescaledData.select('user_id', 'business_id', 'features').orderBy("user_id")
    business_profile = rescaledData.select('business_id', 'features').orderBy("business_id")

    def set2list(x):
        temp = []
        for i in x:
            temp.append(i)
        return temp

    # Build user/business-to-term dictionaries.
    # 91730 businesses: [('business_id', ['case', 'eat', ...]), ...]
    business_dic = business_profile.rdd.map(lambda x: (x[0], list(x[1].indices))) \
        .reduceByKey(lambda a, b: a + b) \
        .mapValues(lambda x: set2list(set(x))).collectAsMap()
    # 13167 users
    # user_profile_dic = {'user1': [word2, word8, word24, ...], ...}
    user_dic = user_profile.rdd.map(lambda x: (x[0], list(x[2].indices))) \
        .reduceByKey(lambda a, b: a + b) \
        .mapValues(lambda x: set2list(set(x))).collectAsMap()

    user_bus = sc.textFile(data).map(elly_func.tojson) \
        .map(lambda x: ((x['user_id'], x['business_id']))) \
        .reduceByKey(lambda a, b: a + b).collectAsMap()

    # Still to do: compute the cosine similarity.

    time_end = time.time()
    print('Duration:', time_end - time_start)
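# The cosine-similarity step is only noted as a to-do above. A minimal sketch under the
# assumption that similarity is taken between the term-index sets in user_dic and
# business_dic (boolean profiles); the example ids are placeholders, not from the source.
import math


def cos_sim(user_id, business_id):
    u = set(user_dic.get(user_id, []))
    b = set(business_dic.get(business_id, []))
    if not u or not b:
        return 0.0
    # For boolean vectors, cosine similarity is |A ∩ B| / sqrt(|A| * |B|).
    return len(u & b) / math.sqrt(len(u) * len(b))


print(cos_sim('some_user_id', 'some_business_id'))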
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(), outputCol="features1")
    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(), outputCol="features2")
    vecAssembler = VectorAssembler(inputCols=["features1", "features2"], outputCol="features")
    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])

    paramGrid = (ParamGridBuilder()
                 .addGrid(lor.maxIter, [10, 20])
                 .addGrid(lor.regParam, [0.1, 0.01])
                 .build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3, estimatorParamMaps=paramGrid,
                              evaluator=eva, numFolds=2)

    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name": "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{"name": "Tokenizer_1"}, {"name": "HashingTF_1"}],
            },
            {
                "name": "Pipeline_3",
                "stages": [{"name": "Tokenizer_2"}, {"name": "HashingTF_2"}],
            },
            {
                "name": "CrossValidator",
                "evaluator": {"name": "MulticlassClassificationEvaluator"},
                "tuned_estimator": {
                    "name": "Pipeline_4",
                    "stages": [
                        {"name": "VectorAssembler"},
                        {"name": "OneVsRest", "classifier": {"name": "LogisticRegression"}},
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[metadata.param_search_estimators[0].uid]
            == "CrossValidator")
train = train.withColumn("comment_text", stemmer_udf("comment_text"))


def check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate):
    if (toxic + severe_toxic + obscene + threat + insult + identity_hate) > 0:
        return 0
    else:
        return 1


# Merge the six label columns into a single binary "clean" flag.
mergeCols = udf(lambda toxic, severe_toxic, obscene, threat, insult, identity_hate:
                check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate),
                IntegerType())

train = train.withColumn("clean", mergeCols(train["toxic"], train["severe_toxic"],
                                            train["obscene"], train["threat"],
                                            train["insult"], train["identity_hate"]))

tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered").setCaseSensitive(False)
hashingTF = HashingTF().setNumFeatures(1000).setInputCol("filtered").setOutputCol("rawFeatures")
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(0)
nb = NaiveBayes(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

train = train.drop('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
train = train.withColumnRenamed("clean", "label")
training_spark_df_binary, testing_spark_df_binary = train.randomSplit([0.8, 0.2], seed=2018)

paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures, [1000]) \
    .addGrid(nb.smoothing, [1]) \
    .build()
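# The grid above is built but never used in this excerpt. A minimal sketch of wiring it into
# a cross-validated fit; the fold count and evaluator choice are assumptions, not from the
# source.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label")
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(training_spark_df_binary)
predictions = cv_model.transform(testing_spark_df_binary)
print("AUC:", evaluator.evaluate(predictions))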
conf = SparkConf()
conf.setAppName("part2_uni")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)

# Read the input
lines = sc.wholeTextFiles("/cosc6339_s17/books-longlist/")

# Configure the SparkSession
spark = SparkSession(sc)
hasattr(lines, "toDF")

# Tokenize the words and convert into a DataFrame
tokenize = lines.map(part2).toDF(["bookname", "words"])

# Convert into unigrams
unigram = NGram(n=1, inputCol="words", outputCol="unigrams")
unigramdataframe = unigram.transform(tokenize)

# Find the tf values
hashingTF = HashingTF(inputCol="unigrams", outputCol="unigram-tf")
tf = hashingTF.transform(unigramdataframe)

# Find the idf values
idf = IDF(inputCol="unigram-tf", outputCol="unigram-tf-idf")
idfModel = idf.fit(tf)
tfidfignore = idfModel.transform(tf)

# Save the output
tfidfignore.rdd.saveAsTextFile("/bigd12/output2_1")
test.cache()

regexTokenizer = RegexTokenizer(gaps=False,
                                pattern="\\w+",
                                inputCol="name",
                                outputCol="name_parts",
                                toLowercase=True)

stopWords = ["mr", "mrs", "miss", "master", "jr", "j", "c", "d"]
remover = StopWordsRemover(inputCol="name_parts",
                           outputCol="filtered_name_parts",
                           stopWords=stopWords)

hashingTF = HashingTF(numFeatures=1000,
                      inputCol="filtered_name_parts",
                      outputCol="text_features")

sexIndexer = StringIndexer(inputCol="sex",
                           outputCol="sexIndexed",
                           handleInvalid="keep")

embarkedIndexer = StringIndexer(inputCol="embarked",
                                outputCol="embarkedIndexed",
                                handleInvalid="keep")

imputer = Imputer(strategy="mean",
                  inputCols=[
                      "pclass", "sibsp", "parch", "sexIndexed", "embarkedIndexed",
                      "age", "fare"
                  ],
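# The Imputer call above is cut off before its output columns. A minimal sketch of how it
# could be closed; the output column names are assumptions, not taken from the source.
imputer = Imputer(strategy="mean",
                  inputCols=["pclass", "sibsp", "parch", "sexIndexed", "embarkedIndexed",
                             "age", "fare"],
                  outputCols=["pclass_imp", "sibsp_imp", "parch_imp", "sexIndexed_imp",
                              "embarkedIndexed_imp", "age_imp", "fare_imp"])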