Example #1
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1],
                                                                     42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])

    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" +
                                   os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example #2
    def featureExtractLr(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            # pipeline = PipelineModel.load(ROOT_PATH+'/logistic')
            pipeline = Pipeline.load(ROOT_PATH + '/logistic')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # set the stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            lr = LogisticRegression(maxIter=10,
                                    regParam=0.001).setElasticNetParam(0.8)
            pipeline = Pipeline(stages=[remover, hashingTF, lr])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
        # model.write().overwrite().save(ROOT_PATH+'/logistic')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("id", "features", "probability",
                                          "prediction")

        for row in selected.collect():
            rid, features, prob, prediction = row
            self.logger.info("features: %s", features)
            self.logger.info("prob: %s", str(prob))
            self.logger.info("prediction: %s", str(prediction))
Example #3
class SentimentalPipelineEngine(PipelineEngine):
    def __init__(self, cv):
        super(SentimentalPipelineEngine, self).__init__(cv)
        self.tokenizer_map = [TweetTokenizer()]
        self.ngram_map = [1]
        self.hashing_tf_map = [pow(2, 20)]
        self.clf_map = [0.1]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=self.stages)
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
        self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
        self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
        self.idf = IDF(inputCol="features", outputCol="idf_features")
        self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
        self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
        # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000, layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
        return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter, self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
        param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
        return param_grid_builder.build()
Example #4
    def featureExtract(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # set the stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
            # lr = LogisticRegression(maxIter=10, regParam=0.001)
            pipeline = Pipeline(stages=[remover, hashingTF, idf])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("filtered", "features", "idff")

        for row in selected.collect():
            filtered, features, idff = row
            self.logger.info("features: %s", features)
            self.logger.info("idff: %s", idff)
            self.logger.info("filtered: %s", filtered)
        return selected
Example #5
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
Example #6
def main(*args):
    if len(args) != 2:
        print("Please provide one input and one output directory!")
        sys.exit(1)

    input_fn, output_fn = args[0],args[1]
    conf = SparkConf()
    conf.setAppName("grant")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Load the abstract content in the test folder into spark, 
    # clean text, tokenize the corpus, and stem the words
    abstract = sc.textFile(input_fn)
    df_abs = (abstract.map(lambda doc: text_cleaning(doc))
                      .filter(lambda doc: len(doc) > 0)
                      .filter(lambda line: not line.startswith('app'))
                      .map(lambda doc: doc.split(' '))
                      .map(lambda word: [x for x in word if len(x)>0])
                      .map(lambda word: stem(word))
                      .map(lambda doc: (int(doc[0]), doc[1:]))
                      .filter(lambda doc: len(doc[1])>0)
                      .toDF(['Id','words']))
    # build the pipeline and lda model with online optimizer
    stop_words = StopWordsRemover(inputCol='words',
                             outputCol='clean')
    stop_words.setStopWords(stop_words.loadDefaultStopWords('english'))
    countv = CountVectorizer(inputCol=stop_words.getOutputCol(), 
                             outputCol="tokens")
    idf = IDF(inputCol=countv.getOutputCol(),outputCol="features")
    lda = LDA(maxIter=10,k=10,optimizer='online')
    pipeline = Pipeline(stages=[stop_words, countv, idf, lda])
    lda_model = pipeline.fit(df_abs)
    labels = lda_model.transform(df_abs)
    
    # identify the label as the topic with the max probability
    # save the label to file
    topic_labels = (labels.select('Id','topicDistribution')
                          .rdd
                          .map(lambda x: (x[0],np.argmax(x[1])))
                          .saveAsTextFile(os.path.join(output_fn,'labels')))
    # Get the topics
    wordnum = 5 # choose the number of topic words
    vocabulary = lda_model.stages[1].vocabulary
    voc_bv = sc.broadcast(vocabulary)
    topic_df = (lda_model.stages[3].describeTopics(wordnum)
                     .rdd
                     .map(lambda x: (x[0],[voc_bv.value[Id] for Id in x[1]],x[2]))
                     .saveAsTextFile(os.path.join(output_fn,'words')))
Example #7
def pipeline(cleaned_dataframe, stopwordlist=None):
    """Pipeline for Tokenizing, removing stop words, and performing word count."""
    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")

    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(),
                                 outputCol="features")

    pipe_line = Pipeline(stages=[tokenizer, stop_remover, count_vect])
    model = pipe_line.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)

    return featurized_data, model.stages[-1].vocabulary
Example #8
def main(spark, numTopics):

    jokesDF = spark.read.schema(
        StructType([
            StructField("jokeID", IntegerType(), False),
            StructField("raw_text", StringType(), False),
        ])).csv("s3://aws-emr-resources-257018485161-us-east-1/jokes_3.csv",
                header="true")

    #jokesDF = jokesDF.withColumn("text", clean_text_udf("raw_text"))

    (training, test) = jokesDF.randomSplit([0.8, 0.2])

    register_remove_punctuation_udf(spark)

    stopwords = spark.sparkContext.textFile(
        "s3://aws-emr-resources-257018485161-us-east-1/stopwords").collect()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(stopWords=stopwords,
                               inputCol=tokenizer.getOutputCol(),
                               outputCol="filtered")
    vectorizer = CountVectorizer(inputCol=remover.getOutputCol(),
                                 outputCol="features",
                                 minDF=2)
    lda = LDA(k=numTopics)

    pipeline = Pipeline(stages=[
        SQLTransformer(
            statement=
            "SELECT jokeID, remove_punctuation_udf(raw_text) text FROM __THIS__"
        ), tokenizer, remover, vectorizer, lda
    ])

    model = pipeline.fit(training)
    model.write().overwrite().save(
        "s3://aws-emr-resources-257018485161-us-east-1/ldaPipelineModel")

    prediction = model.transform(test)

    prediction.show()
Example #9
def create_pipeline(model_type, num_features=10000):
    """
    Defines pipeline from BOW to prediction.
    """

    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                outputCol="features")

    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()

    return Pipeline(stages=[remover, hashingTF, tfidf,
                                model])
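# A possible usage sketch (not part of the original snippet): 'reviews_df' is an
# assumed DataFrame with a tokenized "bow" array column and a numeric "label" column.
bow_pipeline = create_pipeline('log_reg')
bow_model = bow_pipeline.fit(reviews_df)
scored_df = bow_model.transform(reviews_df)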
Example #10
def benchmark_body_pipeline(cleaned_dataframe, stopwordlist=None):
    """NLP pipeline. Tokenizes, removes stopwords, and computes TF-IDF
    Returns transformed data as 'features' and the vocabulary of words."""

    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")

    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(),
                                 outputCol="Text_counts_raw")
    idf = IDF(inputCol=count_vect.getOutputCol(), outputCol="features")

    pipeline = Pipeline(stages=[tokenizer, stop_remover, count_vect, idf])
    model = pipeline.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)

    return featurized_data, model.stages[-2].vocabulary
Example #11
class NaiveBayesModel:
	"""
	Creates a Naive Bayes model using pipelines
	"""
	def __init__(self, training_data):
		self.training_data = training_data

		self.regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
		self.remover = StopWordsRemover(inputCol=self.regex_tokenizer.getOutputCol(), outputCol="filtered")
		self.hashing_tf = HashingTF(inputCol=self.remover.getOutputCol(), outputCol="features")

		#column names "features" and "label" are defaults in the spark ml NB API
		#so no need to specify columns to run model on
		self.naive_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")

		self.model = (
			Pipeline(stages=[
				self.regex_tokenizer,
				self.remover,
				self.hashing_tf,
				self.naive_bayes
			])
			.fit(training_data)
		)

	def get_model(self):
		return self.model

	def calculate_accuracy(self, test_data):
		predictions = self.model.transform(test_data)

		evaluator = MulticlassClassificationEvaluator(
			labelCol="label", predictionCol="prediction",
			metricName="accuracy"
		)

		accuracy = evaluator.evaluate(predictions)
		print("Model accuracy: %s" % accuracy)
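# A possible usage sketch (not from the original): 'labelled_df' is an assumed
# DataFrame with a string "text" column and a numeric "label" column.
train_df, test_df = labelled_df.randomSplit([0.8, 0.2], seed=42)
nb_wrapper = NaiveBayesModel(train_df)
nb_wrapper.calculate_accuracy(test_df)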
Example #12
    def train_validate(self, df):
        # Split the data into training and test sets (30% held out for testing)
        (training, test) = df.randomSplit([0.7, 0.3])

        # Configure an ML pipeline, which consists of four stages: tokenizer, remover, hashingTF, and a classifier.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                   outputCol="filtered")
        hashingTF = HashingTF(numFeatures=10000,
                              inputCol=remover.getOutputCol(),
                              outputCol="features")

        ####################
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        # pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])
        ####################

        # instantiate the base classifier.
        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
        # instantiate the One Vs Rest Classifier.
        ovr = OneVsRest(classifier=lr)
        pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, ovr])
        #####################

        # Fit the pipeline to training documents.
        model = pipeline.fit(training)

        # Make predictions on test documents and print columns of interest.
        prediction = model.transform(test)

        # obtain evaluator.
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

        # compute the classification error on test data.
        accuracy = evaluator.evaluate(prediction)
        print("Test Error : " + str(1 - accuracy))
        return model
Example #13
plt.title('Review Sentence Length')
plt.savefig(IMG_PATH + 'lengthSorted.png', format='png', transparent=True)
plt.show()

# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol='stoppedWords').setStopWords(stopWords)
pipeline = Pipeline(stages=[tokenizer, stopWordRemover])
dataSet = pipeline.fit(dataSet).transform(dataSet)

# %%
newLengthDF = dataSet.withColumn('newLength',
                                 F.size(stopWordRemover.getOutputCol()))

# %%
newSentenceLen = newLengthDF.select('class', 'newLength').collect()

#%%
y = [
    int(row['newLength']) for row in newSentenceLen if (int(row['class']) == 0)
]
mx, mn = max(y), min(y)
ptp = mx - mn
y1 = [(i - mn) / ptp for i in y]

y = [
    int(row['newLength']) for row in newSentenceLen if (int(row['class']) == 1)
]
Example #14
    conn = S3Connection()
    sc = set_spark_context()
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

    logging.basicConfig(format='%(asctime)s %(message)s')
    grid_search = logging.getLogger('main')
    grid_search.setLevel(logging.DEBUG)
    handler = logging.FileHandler('../logs/grid_search.txt')
    grid_search.addHandler(handler)

    bow_rdd = sm.RDD.map(lambda kv: (kv[0], kv[1][0]))
    bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0]))

    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                numFeatures=10000)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
                minDocFreq=20)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")

    for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:

        if type(model) == MultilayerPerceptronClassifier:
            layers = [10000, 100, 2]
            model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf, # scaler,
                                    indexer, model])
        scores = cross_val_score(pipeline, bow_rdd)
        grid_search.debug('Model: %s\nscores: %s\nAverage: %s' \
Example #15
reviews_mini = update_text_with_key_ngrams(reviews_mini,
                                           n=2,
                                           seed=42,
                                           outputCol=outputCol,
                                           pattern=pattern)
print("\n")

## PREDICT LABEL BASED ON TF-IDF OF UPDATED TEXT
print("Computing TF-IDF matrix for updated text...")
tokenizer = RegexTokenizer(inputCol=outputCol,
                           outputCol="words_with_ngrams",
                           pattern=pattern,
                           gaps=False)
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                      outputCol="filtered_words")
cv = CountVectorizer(inputCol=stop_words_remover.getOutputCol(),
                     outputCol="final_tf")
idf = IDF(inputCol=cv.getOutputCol(),
          outputCol="final_tf_idf")

pipe = Pipeline(stages=[
    tokenizer,
    stop_words_remover,
    cv,
    idf
])

reviews_mini = pipe.fit(reviews_mini).transform(reviews_mini)

## Train test split
train, test = reviews_mini.randomSplit([0.8, 0.2], seed=seed)
Example #16
sc = SparkContext("local", "Simple App")
spark = SparkSession.builder.master("local").appName("Word Count").config(
    "spark.some.config.option", "some-value").getOrCreate()

df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv',
                    header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])

training = df.selectExpr("cast(itemID as int) id", "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(),
                        outputCol="features",
                        p=1.0)

#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)
pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
Example #17
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300, # 5 minutes
                   predictionCol="label")

## Remove all helper columns
Example #18

nltk.download('stopwords')

# list of stopwords to be removed from the posts
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
RegexTokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                outputCol="words",
                                pattern="[^0-9a-z#+_]+")
StopwordRemover = StopWordsRemover(
    inputCol=RegexTokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
CountVectorizer = CountVectorizer(inputCol=StopwordRemover.getOutputCol(),
                                  outputCol="countFeatures",
                                  minDF=5)
idf = IDF(inputCol=CountVectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, RegexTokenizer, StopwordRemover,
    CountVectorizer, idf, rf, idx_2_string
])
Example #19
###############################################################################################
# Pipeline
###############################################################################################
# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Remove stop words in the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                            outputCol="no_stops",
                            stopWords=swords)
# The cheaper way to do TF-IDF:
# HashingTF hashes each term to an index and stores the term frequencies as a
# sparse vector, so entries with a count of 0 are never stored.
# The output looks like (vector_size, {index: count, ...}); indices whose count
# is 0 are simply skipped, so the stored indices can jump (e.g. 0, 1, 6, 8, ...)
# depending on the contents of the previous stage.
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")
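# Illustrative sketch (toy values, not from the original): hashing the tokens
# ["spark", "spark", "ml"] with numFeatures=8 would yield a sparse vector such
# as SparseVector(8, {1: 2.0, 6: 1.0}); the zero-count slots are never stored.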
# Performs the IDF part in TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="features1",
          minDocFreq=5)
# Combines the Tokenizer-StopWords-HashingTF-IDF features with the Vader output
assembler = VectorAssembler(inputCols=["features1", "vader"],
                            outputCol="features")
# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)
# Creates pipeline
pipeline = Pipeline(
    stages=[tokenizer, stopword, hashingTF, idf, assembler, lr])

###############################################################################################
# Fit model to training set
Example #20
class SentAnalysisModelTraining(object):
    def __init__(self):
        print("Initializing SentAnalysisModelTraining!!");
        self.appName = "Sentiment Analysis Model Training"
        # create Spark session
        self.spark = SparkSession.builder.appName(self.appName) \
            .config("spark.executor.heartbeatInterval", "200000") \
            .config("spark.network.timeout", "300000") \
            .getOrCreate()
        self.modelpath = "sentiment.model"
        self.data = None
        self.training_data = None
        self.testing_data = None
        self.pipeline = None
        self.cross_validator = None
        self.evaluator = None

        self.predict_training_data = None
        self.predict_testing_data = None
        self.cv_model = None
        self.labeled_output = None
        self.tokenizer = None
        self.stopwordsRemover = None
        self.hashTF = None
        self.labeledIndexer = None
        return



    # read csv file into dataFrame with automatically inferred schema
    def read_data(self):
        emotion_csv = self.spark.read.csv('dataset/text_emotions1.csv', inferSchema=True, header=True)
        emotion_csv.show(truncate=False, n=10)
        emotion_csv.select("sentiment").distinct().show()
        print(emotion_csv)

        # select only the "content" and "sentiment" columns
        self.data = emotion_csv.select("content", "sentiment")
        self.data.show(truncate=False, n=10)

        return self.data.count()

    # divide data, 90% for training, 10% for testing
    def split_data(self):
        dividedData = self.data.randomSplit([0.9, 0.1])
        self.training_data = dividedData[0]  # index 0 = data training
        self.testing_data = dividedData[1]  # index 1 = data testing
        train_rows = self.training_data.count()
        test_rows = self.testing_data.count()
        print("Training data rows:", train_rows, "; Testing data rows:", test_rows)
        return

    def create_pipeline(self):
        # Creating all the pipeline elements
        self.tokenizer = Tokenizer(inputCol="content", outputCol="SentimentWords")
        labelStringIdx = StringIndexer(inputCol="sentiment", outputCol="label")
        self.labeledIndexer = labelStringIdx.fit(self.training_data)

        self.stopwordsRemover = StopWordsRemover(inputCol=self.tokenizer.getOutputCol(), outputCol="RelevantWords")
        self.hashTF = HashingTF(inputCol=self.stopwordsRemover.getOutputCol(), outputCol="features")
        self.pipeline = Pipeline(stages=[self.tokenizer, labelStringIdx, self.stopwordsRemover, self.hashTF])

        lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=15, regParam=0.001, \
                                elasticNetParam=0.8, family="multinomial")

        self.evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
        # paramGrid = ParamGridBuilder()\
        #    .addGrid(lr.aggregationDepth,[2,5,10])\
        #    .addGrid(lr.elasticNetParam,[0.0, 0.5, 1.0])\
        #    .addGrid(lr.fitIntercept,[False, True])\
        #    .addGrid(lr.maxIter,[10, 15])\
        #    .addGrid(lr.regParam,[0.01, 0.1]) \
        #    .build()

        paramGrid = ParamGridBuilder() \
            .addGrid(lr.aggregationDepth, [2]) \
            .addGrid(lr.elasticNetParam, [0.8]) \
            .addGrid(lr.maxIter, [15]) \
            .addGrid(lr.regParam, [0.001]) \
            .build()

        # Create 5-fold CrossValidator
        self.cross_validator = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=self.evaluator, numFolds=5)

        return

    # Fit the pipeline to training documents.
    def train_model(self):
        pipelineFit = self.pipeline.fit(self.training_data)
        dataset = pipelineFit.transform(self.training_data)

        # Run cross validations
        self.cv_model = self.cross_validator.fit(dataset)

        # this will likely take a fair amount of time because of the number of models we're creating and testing
        self.predict_training_data = self.cv_model.transform(dataset)
        print("F1 score on the training set after CV is {}".format(self.evaluator.evaluate(self.predict_training_data)))
        print("Training is done!")
        return

    # Fit the pipeline to testing documents and evaluate the cross-validated model.
    def test_model(self):
        pipelineFit = self.pipeline.fit(self.testing_data)
        preparedTestData = pipelineFit.transform(self.testing_data)
        self.predict_testing_data = self.cv_model.transform(preparedTestData)
        print("F1 score on the test set after CV is {}".format(self.evaluator.evaluate(self.predict_testing_data)))

        print("Testing is done!")
        return



    def print_model_summary(self):
        predictionFinal = self.predict_training_data.select(
            "RelevantWords", "prediction", "label", "sentiment")
        predictionFinal.show(n=20, truncate=False)

        # check the accuracy
        correctPrediction = self.predict_testing_data.filter(
            self.predict_testing_data['prediction'] == self.predict_testing_data['label']).count()
        totalData = self.predict_testing_data.count()
        print("correct prediction:", correctPrediction, ", total data:", totalData,
              ", accuracy:", correctPrediction / totalData)

        trainingSummary = self.cv_model.bestModel.summary

        # Obtain the objective per iteration
        objectiveHistory = trainingSummary.objectiveHistory
        # print("objectiveHistory:")
        # for objective in objectiveHistory:
        #    print(objective)

        # for multiclass, we can inspect metrics on a per-label basis
        print("False positive rate by label:")
        for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))

        print("True positive rate by label:")
        for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
            print("label %d: %s" % (i, rate))

        print("Precision by label:")
        for i, prec in enumerate(trainingSummary.precisionByLabel):
            print("label %d: %s" % (i, prec))

        print("Recall by label:")
        for i, rec in enumerate(trainingSummary.recallByLabel):
            print("label %d: %s" % (i, rec))

        print("F-measure by label:")
        for i, f in enumerate(trainingSummary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))

        accuracy = trainingSummary.accuracy
        falsePositiveRate = trainingSummary.weightedFalsePositiveRate
        truePositiveRate = trainingSummary.weightedTruePositiveRate
        fMeasure = trainingSummary.weightedFMeasure()
        precision = trainingSummary.weightedPrecision
        recall = trainingSummary.weightedRecall
        print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
              % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
        return

    def save_model_pipeline(self):
        # pipeline for later deriving value from actual field.
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictionLabel", labels=self.labeledIndexer.labels)
        pipeline = Pipeline(stages=[self.tokenizer, self.stopwordsRemover, self.hashTF, self.cv_model.bestModel, labelConverter])
        pipeline.write().overwrite().save("prediction_pipeline")
Example #21
# Do the writing
trainingFile = open(outputFile, 'wt', encoding="utf-8",newline='')
writer = csv.writer(trainingFile)
writer.writerows(newRows)
trainingFile.close()

trainingFile = open(outputFile,"r")
pandas_df = pd.read_csv(trainingFile)
spark_df = sqlContext.createDataFrame(pandas_df)

#Pipeline
countTokens = udf(lambda words: len(words), IntegerType())
tokenizer = Tokenizer(inputCol="text", outputCol="new_text")
add_stopwords = ["http","https","amp","RT","the"] 
stopwordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered").setStopWords(add_stopwords)
cv = CountVectorizer(inputCol=stopwordsRemover.getOutputCol(), outputCol="features", vocabSize=3, minDF=2.0)
lr= LogisticRegression(maxIter=20, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, stopwordsRemover, cv, lr])
model = pipeline.fit(spark_df)

#Sentiments Predicted
testingData=sqlContext.createDataFrame(testing_df)
prediction = model.transform(testingData)
selected = prediction.select("text", "probability", "prediction")
for row in selected.collect():
    text, prob, prediction = row
    print("(%s) --> prob=%s, prediction=%d" % (text, str(prob), prediction))
	
#Evaluating the model's accuracy
(train, test) = spark_df.randomSplit([0.7, 0.3], seed = 100)
print("Training data count: "+str(train.count()))
Example #22
# %%
trainDF, testDF = sql.SQLContext(spark.sparkContext).createDataFrame(trainDF), sql.SQLContext(spark.sparkContext).createDataFrame(testDF)
# trainDF.show()
# testDF.show()

#%%
print(trainDF.count())
print(trainDF.filter(F.col('class') == 1).count())
print(trainDF.filter(F.col('class') == 0).count())

# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']

tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='stoppedWords').setStopWords(stopWords)
countVector = CountVectorizer(inputCol=stopWordRemover.getOutputCol(), outputCol='vectors')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, stopWordRemover, countVector, idf])
model = pipeline.fit(trainDF)
ptrainDF = model.transform(trainDF)
ptestDF = model.transform(testDF)
ptrainDF.show()

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1")

# %%
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
lrModel = lr.fit(ptrainDF)
predictionsLR = lrModel.transform(ptestDF)
evaluator.evaluate(predictionsLR)
Example #23
        notes_df['rowid'],
        remove_features_udf(notes_df['text'].cast("string")).alias('text')))

    lemmatized_notes_df = filtered_notes_df.withColumn(
        "lemmatized", lemmatize_udf(filtered_notes_df['text']))

    # Defining our ML pipeline.
    tokenizer = RegexTokenizer(inputCol='lemmatized',
                               outputCol='tokens',
                               pattern='\\W')
    common_words = ['admission', 'discharge'] + StopWordsRemover().getStopWords()
    remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                               stopWords=common_words,
                               outputCol='tokens_filtered')
    cv = CountVectorizer(inputCol=remover.getOutputCol(), outputCol='features')
    lda = LDA().setK(3)
    pipeline = Pipeline(stages=[tokenizer, remover, cv, lda])

    # Fitting our pipeline.
    model = pipeline.fit(lemmatized_notes_df)

    # Here we access the last stage of the model, as this is where we applied the LDA.
    lda_model = model.stages[-1]
    cv_model = model.stages[2]

    topics = lda_model.describeTopics()
    topics_with_words = (topics.select(
        topics["topic"],
        make_indices_mapping(cv_model.vocabulary)(
            topics["termIndices"]).alias("terms"), topics["termWeights"]))
Example #24
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)


if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                 seed=1,
                 featuresCols=[idf.getOutputCol()],
                 predictionCol="label")
Example #25
# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3 
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4 
# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')
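# The exercise snippet stops here; a minimal sketch (an assumption, not the
# original solution) of how these objects are typically combined:
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=regression,
                    estimatorParamMaps=params,
                    evaluator=evaluator,
                    numFolds=5)
# cv.fit(...) would then be called on the training DataFrame from the exercise.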
Example #26
    ]).map(lambda x: x.split(' ')).map(lambda y: Row(y)).toDF(
        schema=StructType().add("text", StringType(), True)))

    positive_string = (sc.parallelize([
        "im so happy with my results"
    ]).map(lambda x: x.split(' ')).map(lambda y: Row(y)).toDF(
        schema=StructType().add("text", StringType(), True)))

    test_data_frame.cache()

    regex_tokenizer = RegexTokenizer(inputCol="text",
                                     outputCol="words",
                                     pattern="\\W")
    remover = StopWordsRemover(inputCol=regex_tokenizer.getOutputCol(),
                               outputCol="filtered")
    hashing_tf = HashingTF(inputCol=remover.getOutputCol(),
                           outputCol="features")

    #Column names "features" and "label" are defaults in the ml NB API
    #so no need to specify columns to run model on
    naive_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")

    #Can just pipeline the DF, no need to turn into labelled point!
    pipeline = Pipeline(
        stages=[regex_tokenizer, remover, hashing_tf, naive_bayes])

    #form the model
    model = pipeline.fit(training_data_frame)

    #run the prediction
    predictions = model.transform(test_data_frame)
Example #27
# MAGIC %md ### Define the Pipeline
# MAGIC The pipeline for the model consists of the following stages:
# MAGIC - A Tokenizer to split the tweets into individual words.
# MAGIC - A StopWordsRemover to remove common words such as "a" or "the" that have little predictive value.
# MAGIC - A HashingTF class to generate numeric vectors from the text values.
# MAGIC - A LogisticRegression algorithm to train a binary classification model.

# COMMAND ----------

# convert sentence to words' list
tokenizer = Tokenizer(inputCol="text", outputCol="SentimentWords")
# remove stop words
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                       outputCol="MeaningfulWords")
# convert word to number as word frequency
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
# set the model
lr = LogisticRegression(labelCol="label",
                        featuresCol="features",
                        maxIter=10,
                        regParam=0.01)

# process pipeline with the series of transforms - 4 transforms
pipeline = Pipeline(stages=[tokenizer, swr, hashTF, lr])

# COMMAND ----------

# MAGIC %md ### Run the Pipeline as an Estimator
# MAGIC The pipeline itself is an estimator, so it has a **fit** method that we can call to run the pipeline on a specified DataFrame. In this case, we fit the pipeline to the training data to train a model.

# COMMAND ----------
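# The cell that actually fits the pipeline is not shown above; a minimal sketch
# of that step, assuming a 'train' DataFrame with "text" and "label" columns
# (the name 'train' is an assumption, not from the original notebook):
model = pipeline.fit(train)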
Example #28
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     predictionCol="label")

## Remove all helper columns
Example #29
    StructField("id", IntegerType(), True),
    StructField("text", StringType(), True),
    StructField("label", DoubleType(), True)
]
finalSchema = StructType(fields=newDF)
dataset = sqlContext.read.format('csv').options(
    header='true',
    delimiter='|').schema(finalSchema).load('/FileStore/tables/dataset.csv')

dataset = dataset.withColumn("label", dataset["label"].cast(DoubleType()))
dataset = dataset.withColumn("id", dataset["id"].cast(IntegerType()))
training, test = dataset.randomSplit([0.8, 0.2], seed=12345)

tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=2, regParam=0.001)
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)
result = model.transform(test)\
    .select("features", "label", "prediction")
correct = result.where(result["label"] == result["prediction"])
accuracy = correct.count() / test.count()
print("Accuracy of model = " + str(accuracy))
test_error = 1 - accuracy
print("Test error = " + str(test_error))
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
Example #30
                      outputCol='body')
cleaner = Cleaner(key='subreddit',
                  val='body',
                  inputCol=extractor.getOutputCol(),
                  outputCol='body')
filterer = Filterer(key='subreddit',
                    val='body',
                    inputCol='subreddit',
                    outputCol='body',
                    minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(),
                           outputCol="tokens",
                           pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(),
                     outputCol="tf",
                     minDF=args.mindf,
                     vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(),
                      outputCol='top_words',
                      nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit',
                                  outputCol='norm',
                                  spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits',
                                nsubreddits=args.nsubreddits)

pipeline = Pipeline(stages=[
Example #31
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover, RegexTokenizer
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()

data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)
textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(), stopWords=selfstopwords,outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords,stopwords1,cv,idf, lda])

model = pipe1.fit(df)
output = model.transform(df)

def topicsTerms(vocab, termindices, leng=None):
  if not leng:
    return [vocab[t] for t in termindices]
  return [vocab[t] for t in termindices][:leng]

def topicsTerm_udf(vocab, leng=None):
Example #32
from pyspark.sql.functions import udf, col, explode, collect_list, to_date, concat
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, \
        FloatType, ArrayType, BooleanType
from nltk.stem import SnowballStemmer


# Import json objects from tar file
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

# Setup pipeline for adding ML features - tokens, stems, n-grams, tf, tfidf, word2vec
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens", pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_large')

pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

# Use the model to transform the data
df_transformed = model.transform(opinion_df)
Example #33
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol='reviewText', outputCol='reviewWords')
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='reviewWordsWithoutTrash')
vectorizer = CountVectorizer(inputCol=stop_words_remover.getOutputCol(), outputCol="word_vector", minDF=150)
lr = LinearRegression(featuresCol=vectorizer.getOutputCol(), labelCol='overall')

pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer, lr])