Example 1
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the fitted pipeline (all stages) is saved to the ./kmeans folder
    return model, words_prediction
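Since fit_kmeans persists the fitted pipeline to ./kmeans, it can be reloaded later without refitting; a minimal sketch (new_products_df is an assumed DataFrame with the same "title" column):

from pyspark.ml import PipelineModel

# Hedged sketch: reload the pipeline saved by fit_kmeans and reuse it on new data.
reloaded = PipelineModel.load("./kmeans")
clusters = reloaded.transform(new_products_df)   # new_products_df is assumed, with a "title" column
clusters.select("title", "6_kmeans").show(5)     # "6_kmeans" is the prediction column set by the step counter above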
Example 2
    def featureExtract(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # Set the Chinese stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
            # lr = LogisticRegression(maxIter=10, regParam=0.001)
            pipeline = Pipeline(stages=[remover, hashingTF, idf])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("filtered", "features", "idff")

        for row in selected.collect():
            filtered, features, idff = row
            self.logger.info("features: %s", features)
            self.logger.info("idff: %s", idff)
            self.logger.info(
                "filtered: %s",
                str(filtered).decode("unicode_escape").encode("utf-8"))
        return selected
    def train_lg(training_data, collection):
        # Configure an ML pipeline: hashingTF and idf (pipeline1), followed by lr (pipeline2).
        hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        pipeline1 = Pipeline(stages=[hashingTF, idf])

        # Fit the pipeline1 to training documents.
        model1 = pipeline1.fit(training_data)

        lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        pipeline2 = Pipeline(stages=[model1, lr])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
            .addGrid(lr.regParam, [0.1, 0.01]) \
            .build()

        crossval = CrossValidator(estimator=pipeline2,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=BinaryClassificationEvaluator(),
                                  numFolds=5)

        # Run cross-validation, and choose the best set of parameters.
        cvModel = crossval.fit(training_data)

    #     model_path = os.path.join(models_dir , time.strftime("%Y%m%d-%H%M%S") + '_'
    #                             + collection["Id"] + '_'
    #                             + collection["name"])
    #     cvModel.save(sc, model_path)
        return cvModel
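The commented-out save above uses the old RDD-style signature; with pyspark.ml the fitted model can be persisted through the ML writer instead. A hedged sketch reusing the path construction from the comment (models_dir is assumed to be defined elsewhere):

import os
import time

# Hedged sketch: persist the best pipeline found by cross-validation.
model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
                          + collection["Id"] + '_' + collection["name"])
cvModel.bestModel.write().overwrite().save(model_path)      # best PipelineModel only
# cvModel.write().overwrite().save(model_path + '_cv')      # or the full CrossValidatorModel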
def train_svm_idf(sqlContext, df):

    training, test = df.randomSplit([0.8, 0.2])

    tokenizer = Tokenizer(inputCol="body", outputCol="words")

    hashingTF = HashingTF(numFeatures=2000,
                          inputCol=tokenizer.getOutputCol(),
                          outputCol="rawFeatures")

    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    svm = LinearSVC(featuresCol="features", labelCol="label")

    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, svm])
    model = pipeline.fit(training)

    test_df = model.transform(test)
    train_df = model.transform(training)

    test_df.show()
    train_df.show()

    evaluator = BinaryClassificationEvaluator(labelCol="label")
    # rawPredictionCol defaults to "rawPrediction"; pass rawPredictionCol="prediction"
    # to score on the hard predictions instead

    train_metric = evaluator.evaluate(train_df)
    test_metric = evaluator.evaluate(test_df)
    test_p = test_df.select("prediction").rdd.map(
        lambda x: x['prediction']).collect()
    test_l = test_df.select("label").rdd.map(lambda x: x['label']).collect()
    train_p = train_df.select("prediction").rdd.map(
        lambda x: x['prediction']).collect()
    train_l = train_df.select("label").rdd.map(lambda x: x['label']).collect()

    print("\n\n\n\n")
    print("-" * 15 + " OUTPUT " + "-" * 15)
    print()
    print("confusion matrix for trainning data")
    print(train_metrix)
    print("train label")
    print(train_l)
    print("train prediction")
    print(train_p)
    print("-" * 30)
    print()
    print("confusion matrix for testing data")
    print(test_metrix)
    print("test label")
    print(test_l)
    print("test prediction")
    print(test_p)

    print("-" * 30)
    print("\n\n\n\n")
Example 5
    def __init__(self, data):
        tokenizer = Tokenizer(inputCol="text", outputCol="words")

        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")

        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

        lr = LogisticRegression()

        pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

        self.model = pipeline.fit(data)
Example 6
def nb_train(data):
    #Naive Bayes Classifier
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel=label_stringIdx.fit(data)
    data=lsmodel.transform(data)
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5)
    nb = NaiveBayes(smoothing=1)
    pipeline = Pipeline(stages=[countVectors,nb])
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")

    return (evaluator.evaluate(predictions),lsmodel.labels,pipelineFit)
    def pipeline(self):

        from pyspark.ml import Pipeline
        from pyspark.ml.feature import HashingTF, IDF
        from pyspark.ml.feature import Tokenizer
        from pyspark.ml.classification import LogisticRegression

        tokenizer = Tokenizer(inputCol="message", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="tempfeatures")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
        lrClassifier = LogisticRegression()

        pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lrClassifier])

        return pipeline
Example 8
def nb_train_cv(data):
    #Naive Bayes Classifier
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel=label_stringIdx.fit(data)
    data=lsmodel.transform(data)
    data.cache()
    #(trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5)
    nb = NaiveBayes()
    pipeline = Pipeline(stages=[countVectors,nb])
    grid = ParamGridBuilder().addGrid(nb.smoothing, [1]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=grid,
                              evaluator=evaluator,
                              numFolds=10)
    cvmodel = crossval.fit(data)
    return (evaluator.evaluate(cvmodel.transform(data)), lsmodel.labels, cvmodel)
def create_pipeline(model_type, num_features=10000):
    """
    Defines pipeline from BOW to prediction.
    """

    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                outputCol="features")

    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()

    return Pipeline(stages=[remover, hashingTF, tfidf, model])
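A hedged usage sketch for create_pipeline (DataFrame and column contents are assumptions: "bow" must already hold tokenized text and "label" the target):

# Hypothetical usage of create_pipeline; train_df / test_df are assumed DataFrames.
pipeline = create_pipeline('log_reg', num_features=5000)
model = pipeline.fit(train_df)
model.transform(test_df).select("label", "prediction", "probability").show(5)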
Example 10
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
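The engine only builds the pipeline and the parameter grid; presumably the base PipelineEngine combines them with the cv argument. A hedged sketch of that wiring (function name, evaluator, and fold count are assumptions):

from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Hedged sketch, not the actual PipelineEngine implementation.
def fit_with_cv(engine, train_df, num_folds=3):
    cv = CrossValidator(estimator=engine.pipeline,
                        estimatorParamMaps=engine.param_grid,
                        evaluator=BinaryClassificationEvaluator(),
                        numFolds=num_folds)
    return cv.fit(train_df)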
Example 11
                                        count_verbs_udf(split_df['words']))
    has_q_df = verb_count_df.withColumn('has_q',
                                        check_q_udf(verb_count_df['text']))
    stem_df = has_q_df.withColumn('words', stem_udf(has_q_df['words']))
    no_dupes_df = stem_df.dropDuplicates(['words'])
    no_emptys_df = no_dupes_df.filter(no_dupes_df['word_count'] > 1)

    # Split data set
    training_df, testing_df = no_emptys_df.randomSplit([.75, .25])

    # Make Spark ML pipeline (using a MultilayerPerceptronClassifier)
    hashingTF = HashingTF(inputCol='words',
                          outputCol='word_hash',
                          numFeatures=500)
    idf = IDF(minDocFreq=1,
              inputCol=hashingTF.getOutputCol(),
              outputCol='tf-idf')
    va = VectorAssembler(inputCols=[
        'has_link', 'verb_count', 'tf-idf', 'word_count', 'has_q', 'has_tag'
    ])
    mp = MultilayerPerceptronClassifier(
        featuresCol=va.getOutputCol(),
        layers=[505, 250, 100, 50, 25, 10, 5, 2])

    # Create param grid
    grid = ParamGridBuilder().addGrid(mp.maxIter, [50, 100, 200]).addGrid(
        mp.tol,
        [.0000001, .000001, .0001, .01]).addGrid(mp.stepSize,
                                                 [.001, .01, .1]).build()

    evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
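The fragment stops after defining the stages, grid, and evaluator; a hedged sketch of how they might be assembled and cross-validated (stage order and fold count are assumptions):

from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator

# Hedged sketch: wire the stages defined above into a cross-validated pipeline.
pipeline = Pipeline(stages=[hashingTF, idf, va, mp])
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=3)
cv_model = cv.fit(training_df)
print(evaluator.evaluate(cv_model.transform(testing_df)))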
Example 12
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
Example 13
        return dataset.withColumn(
            out_col,
            udf(lambda x: LabeledPoint(1, Vectors.fromML(x)), t)(in_col))


# COMMAND ----------

# MAGIC %md
# MAGIC #### Create data processing pipeline

# COMMAND ----------

# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, labelPointTF, and lsvc.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
labelPointTF = LabelPointTF(inputCol=hashingTF.getOutputCol(),
                            outputCol="vectors")
lsvc = LinearSVC(maxIter=10, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, labelPointTF, lsvc])

# COMMAND ----------

# MAGIC %md
# MAGIC #### Train Email Spam Classifier Model

# COMMAND ----------

model = pipeline.fit(
    emails.select("text",
                  column("spam").alias("label").cast(IntegerType())))
Example 14
# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3 
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4 
# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create a cross validator
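A hedged sketch of the cross validator the last comment announces (fold count is an assumption):

from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=regression,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5)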
Example 15
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

    logging.basicConfig(format='%(asctime)s %(message)s')
    grid_search = logging.getLogger('main')
    grid_search.setLevel(logging.DEBUG)
    handler = logging.FileHandler('../logs/grid_search.txt')
    grid_search.addHandler(handler)

    bow_rdd = sm.RDD.map(lambda (key, (bow, meta)): (key, bow))
    # Note: the line above is immediately overwritten by the join-based mapping below
    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow))

    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                numFeatures=10000)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
                minDocFreq=20)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")

    for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:

        if type(model) == MultilayerPerceptronClassifier:
            layers = [10000, 100, 2]
            model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf, # scaler,
                                    indexer, model])
        scores = cross_val_score(pipeline, bow_rdd)
        grid_search.debug('Model: %s\nscores: %s\nAverage: %s' \
                % (type(model), scores, scores.mean()))
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(),
                       outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(),
                        labelCol=string_indexer.getOutputCol(),
                        maxIter=30,
                        regParam=0.01)

pipeline = Pipeline(
    stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
Example 17
###############################################################################################
# Pipeline
###############################################################################################
# Tokenize by word 
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Remove stop words in the text
stopword = StopWordsRemover(inputCol = tokenizer.getOutputCol(), outputCol = "no_stops")
# The cheaper way to do TF-IDF:
# HashingTF hashes each term into a bucket and counts term frequencies.
# The output is a sparse vector (size, {index: value, ...}) that stores no zero entries,
# so the indices can skip around (0, 1, 6, 8, ...) depending on which buckets the words
# from the previous step hash to (a toy illustration of this format appears right after this example)
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")
# Performs the IDF part in TF-IDF 
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5) 
# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)
# Create the pipeline
pipeline = Pipeline(stages=[tokenizer, stopword, hashingTF, idf, lr])

###############################################################################################
# Fit model to training set

#lr_model = PipelineModel.load('./ModelTest')
lr_model = pipeline.fit(train)
# Make predictions on test set
lr_prediction = lr_model.transform(test)
# Schema of prediction outcome
lr_prediction.printSchema()
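To make the sparse-vector format described in the comments above concrete, a toy standalone illustration (assumes an existing SparkSession named spark; the numbers shown are only indicative):

from pyspark.ml.feature import Tokenizer, HashingTF

demo = spark.createDataFrame([("spark makes hashing term frequencies easy",)], ["text"])
tok = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="hashing", numFeatures=16)
tf.transform(tok.transform(demo)).select("hashing").show(truncate=False)
# Prints a SparseVector such as (16,[1,5,9,13],[1.0,2.0,1.0,1.0]):
# vector size, then the non-zero bucket indices, then their term counts.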
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300, # 5 minutes
                   predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner])
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)


if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                 seed=1,
                 featuresCols=[idf.getOutputCol()],
                 predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                         seed=1,
                         l1=0.001,
normalizerBi = Normalizer(inputCol="bigrams",outputCol='normBigrams',p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words','normWords').show()
# La différence n'apparait pas dans la table puisqu'on n'a la place de visualiser que les indices des élements 
# non nuls et pas leur valeur
# On passe au TFIDF
# Evidemment en choisissant la bonne dataframe parmi celle du dessus, on peut appliquer ces calculs
# à n'importz quelle colonne (bigrammes, avec stop words ou sans...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words',outputCol='wordsTF',numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(),outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review','wordsTF','wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to help much
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review','label','target_indexed').show()



#**********************************************************************
#-----------Training the model for prediction--------------------------
Example 21
###############################################################################################
# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Remove stop words in the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                            outputCol="no_stops",
                            stopWords=swords)
# The cheaper way to do TF-IDF:
# HashingTF hashes each term into a bucket and counts term frequencies.
# The output is a sparse vector (size, {index: value, ...}) that stores no zero entries,
# so the indices can skip around (0, 1, 6, 8, ...) depending on which buckets the words
# from the previous step hash to
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")
# Performs the IDF part in TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="features1",
          minDocFreq=5)
# Appends output Token-Stopwords-HashingTF-IDF with output of Vader
assembler = VectorAssembler(inputCols=["features1", "vader"],
                            outputCol="features")
# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)
# Creates pipeline
pipeline = Pipeline(
    stages=[tokenizer, stopword, hashingTF, idf, assembler, lr])

###############################################################################################
# Fit model to training set

#lr_model = PipelineModel.load('./ModelTest')
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words', 'normWords').show()
# La différence n'apparait pas dans la table puisqu'on n'a la place de visualiser que les indices des élements
# non nuls et pas leur valeur
# On passe au TFIDF
# Evidemment en choisissant la bonne dataframe parmi celle du dessus, on peut appliquer ces calculs
# à n'importz quelle colonne (bigrammes, avec stop words ou sans...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to help much
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()

#**********************************************************************
#-----------Training the model for prediction--------------------------
#**********************************************************************
Example 23
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents.  Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")

idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")

normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages.  We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`.  This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)
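The notebook cell above says a Pipeline will be built and fit, but the snippet ends with the KMeans estimator; a hedged sketch of that final step (the input DataFrame name is an assumption):

# Hedged sketch of the step the markdown cell describes.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(corpus_df)        # corpus_df: assumed DataFrame with a "text" column
clustered = model.transform(corpus_df)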
smsXformed = smsData.map(TransformToVector)
smsDf = SpSession.createDataFrame(smsXformed, ["label", "message"])
smsDf.cache()
smsDf.select("label", "message").show()


(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])

from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.feature import IDF, HashingTF, Tokenizer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol = 'message', outputCol = 'words')
hashingTF = HashingTF(inputCol = tokenizer.getOutputCol(), outputCol = 'tempfeatures')
idf = IDF(inputCol = hashingTF.getOutputCol(), outputCol = 'features')
nb_classifier = NaiveBayes()

pipeline = Pipeline(stages = [tokenizer, hashingTF, idf, nb_classifier])

nb_model = pipeline.fit(trainingData)
prediction = nb_model.transform(testData)

evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction',
	labelCol = 'label', metricName = 'accuracy')
evaluator.evaluate(prediction)
evaluator.evaluate(nb_model.transform(trainingData))

prediction.groupBy('label', 'prediction').count().show()
prediction2 = nb_model.transform(trainingData)
prediction2.groupBy('label', 'prediction').count().show()
Example 25
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
Example 26
    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow)) \
            .sample(withReplacement=False, fraction=.5, seed=1)
    df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
    train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
    results = []

    num_features = 5000
    min_doc_freq = 20
    layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]

    for l in layers:
        remover = StopWordsRemover(inputCol="raw", outputCol="words")
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                              numFeatures=num_features)
        tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                    outputCol="features", minDocFreq=min_doc_freq)
        indexer = StringIndexer(inputCol="string_label", outputCol="label")

        mlpc = MultilayerPerceptronClassifier(maxIter=100,
                                              layers=l,
                                              blockSize=128)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf,
                                    indexer, mlpc])

        model = pipeline.fit(train_rdd)
        df_output = model.transform(train_rdd)
        test_output = model.transform(test_rdd).select("label", "prediction")
        score = test_output.rdd.map(lambda row: row.label == row.prediction).mean()
        nn_gridsearch.debug("Layers: %s, Accuracy: %s" % (l, score))
Example 27
    "spark.some.config.option", "some-value").getOrCreate()

df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv',
                    header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])

training = df.selectExpr("cast(itemID as int) id", "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(),
                        outputCol="features",
                        p=1.0)

#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)
pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
Example 28
def Pipeline_model(ngram,
                   nb_hash,
                   data,
                   opm,
                   vec="tf_idf",
                   maxIter=200,
                   regParam=0.01,
                   elasticNetParam=0.0,
                   numTrees=200,
                   maxdepth=16):
    # Tokenizer that turns a long text into a list of words
    regexTokenizer = RegexTokenizer(inputCol="lib1",
                                    outputCol="tokenizedDescr",
                                    pattern="[^a-z_]",
                                    minTokenLength=3,
                                    gaps=True)

    # StopWordsRemover used to drop stop words ("remover" is presumably a project-specific alias/wrapper)
    remover1 = remover(inputCol="tokenizedDescr",
                       outputCol="stopTokenizedDescr")
    # Stemmer
    #stemmer = MyNltkStemmer(inputCol="stopTokenizedDescr", outputCol="cleanDescr")
    # Define NGram transformer
    ngram1 = NGram(n=ngram, inputCol="stopTokenizedDescr", outputCol="bigrams")
    # Indexer
    indexer = StringIndexer(inputCol="lib4",
                            outputCol="categoryIndex").fit(data)
    if vec == "tf_idf":
        # Hashing
        hashing_tf = HashingTF(inputCol="bigrams",
                               outputCol='tf',
                               numFeatures=nb_hash)
        # Inverse Document Frequency
        idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="tfidf")

    else:
        Word2Vec_ = Word2Vec(vectorSize=300,
                             minCount=0,
                             inputCol="bigrams",
                             outputCol="tfidf",
                             seed=42)

    assembler = VectorAssembler(inputCols=["tfidf", "credit_o_n"],
                                outputCol="features")

    # Logistic Regression
    if opm == "rl":
        print("use LogisticRegression")
        model = LogisticRegression(maxIter=maxIter,
                                   regParam=regParam,
                                   fitIntercept=False,
                                   tol=0.0001,
                                   family="multinomial",
                                   elasticNetParam=elasticNetParam,
                                   featuresCol="features",
                                   labelCol="categoryIndex")
    elif opm == "rf":
        print("RandomForestClassifier")
        model = RandomForestClassifier(labelCol="categoryIndex",
                                       featuresCol="features",
                                       numTrees=numTrees,
                                       maxMemoryInMB=1000,
                                       maxDepth=maxdepth,
                                       seed=42)
    elif opm == "mp":
        print("MultilayerPerceptronClassifier")
        model = MultilayerPerceptronClassifier(maxIter=maxIter,
                                               labelCol="categoryIndex",
                                               featuresCol="features",
                                               layers=[1001, 70, 26],
                                               blockSize=128,
                                               seed=42)

    else:
        print("GradientBoostedTree")
        model = GBTClassifier(labelCol="categoryIndex",
                              featuresCol="features",
                              maxIter=maxIter,
                              seed=42)
    # Convert the indexed labels back to the original labels.
    labelConverter = IndexToString(inputCol="prediction",
                                   outputCol="predictedLabel",
                                   labels=indexer.labels)
    # Build the pipeline
    if vec == "tf_idf":
        return Pipeline(stages=[
            regexTokenizer, remover1, ngram1, indexer, hashing_tf, idf,
            assembler, model, labelConverter
        ])
    else:
        return Pipeline(stages=[
            regexTokenizer, remover1, ngram1, indexer, Word2Vec_, assembler,
            model, labelConverter
        ])
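A hedged usage sketch for Pipeline_model, assuming a DataFrame data with the lib1, lib4, and credit_o_n columns the function expects:

# Hypothetical usage; split ratio and arguments are assumptions.
train_df, test_df = data.randomSplit([0.8, 0.2], seed=42)
pipe = Pipeline_model(ngram=2, nb_hash=10000, data=data, opm="rl")
fitted = pipe.fit(train_df)
fitted.transform(test_df).select("lib4", "predictedLabel").show(10)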