def defineNB_model():
    VOCAB_SIZE = 20000
    MINDF = 3
    TRAINING_ITERS = 150
    tokenizer = pipeline_utils.TweetTokenizer(inputCol='text', outputCol='words')
    stopword_remover = ml.feature.StopWordsRemover(
        inputCol=tokenizer.getOutputCol(),
        outputCol='filtered',
        stopWords=list(all_stopwords))
    stemmer = pipeline_utils.Stemmer(
        inputCol=stopword_remover.getOutputCol(), outputCol='cleaned_words')
    counter = ml.feature.CountVectorizer(inputCol=stemmer.getOutputCol(),
                                         outputCol='features',
                                         vocabSize=VOCAB_SIZE,
                                         minDF=MINDF)
    classifier = ml.classification.NaiveBayes(smoothing=1.0,
                                              modelType='multinomial')
    pipeline = ml.Pipeline(
        stages=[tokenizer, stopword_remover, stemmer, counter, classifier])
    return pipeline, classifier
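# The pipelines in this file rely on two custom transformers from a project
# module, pipeline_utils (TweetTokenizer and Stemmer), whose source is not
# shown here. The following is only an assumed sketch of how Stemmer could be
# written as a pyspark.ml.Transformer wrapping NLTK's PorterStemmer; the
# TweetTokenizer would be analogous, wrapping nltk.tokenize.TweetTokenizer.
from nltk.stem.porter import PorterStemmer
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql import functions as F, types as T


class Stemmer(Transformer, HasInputCol, HasOutputCol):
    """Assumed sketch: stems every token in an array<string> column."""

    def __init__(self, inputCol=None, outputCol=None):
        super().__init__()
        self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, df):
        stemmer = PorterStemmer()
        # map each array of tokens to an array of stemmed tokens
        stem_udf = F.udf(lambda words: [stemmer.stem(w) for w in words],
                         T.ArrayType(T.StringType()))
        return df.withColumn(self.getOutputCol(), stem_udf(self.getInputCol()))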
def tfidf_review_text(df):
    with Timer("TF-IDF for reviewText"):
        df = df.select(["reviewText"]).dropna()

    with Timer("TF-IDF pipeline"):
        tokenizer = ml.feature.Tokenizer(inputCol="reviewText",
                                         outputCol="token")
        cv = ml.feature.CountVectorizer(inputCol="token", outputCol="hash")
        idf = ml.feature.IDF(inputCol="hash", outputCol="tfidf")
        pipeline = ml.Pipeline(stages=[tokenizer, cv, idf])
        model = pipeline.fit(df)
        df = model.transform(df)
        df.unpersist()
        # df.cache()

        stages = model.stages
        # print(f"stages: {stages}")
        vectorizers = [s for s in stages if isinstance(s, CountVectorizerModel)]
        vocab = [v.vocabulary for v in vectorizers]
        vocab = vocab[0]
        # print(f"Length of Vocab: {len(vocab[0])}")
        idx2word = {idx: word for idx, word in enumerate(vocab)}

    with Timer("Convert TF-IDF sparseVector to (word:value dict)"):
        my_udf_func = udf(lambda vector: sparse2dict(vector, idx2word),
                          types.StringType())
        df = df.select("reviewText", my_udf_func("tfidf").alias("tfidf"))

    return df
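# Both versions of tfidf_review_text pass the TF-IDF SparseVector through a
# sparse2dict helper that is not shown here. A minimal assumed sketch of what
# it might do (the project's real helper may differ in rounding, ordering, or
# output format): map each active index back to its vocabulary word and return
# a printable {word: weight} string.
def sparse2dict(vector, idx2word):
    # vector is assumed to be a pyspark.ml.linalg.SparseVector
    weights = {idx2word[int(i)]: float(v)
               for i, v in zip(vector.indices, vector.values)}
    # order by descending TF-IDF weight so the heaviest terms come first
    ordered = sorted(weights.items(), key=lambda kv: -kv[1])
    return str({word: round(weight, 4) for word, weight in ordered})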
def define_combo_model():
    VOCAB_SIZE = 10000
    MINDF = 3
    TRAINING_ITERS = 150
    tokenizer = pipeline_utils.TweetTokenizer(inputCol='text', outputCol='words')
    stemmer = pipeline_utils.Stemmer(inputCol=tokenizer.getOutputCol(),
                                     outputCol='cleaned_words')
    stopword_remover = ml.feature.StopWordsRemover(
        inputCol=stemmer.getOutputCol(),
        outputCol='stopwords_removed',
        stopWords=list(all_stopwords))
    unigram_tfidf = TFIDF_pipeline('unigram', stopword_remover.getOutputCol(),
                                   15000)
    ngrammer = ml.feature.NGram(n=3,
                                inputCol=stemmer.getOutputCol(),
                                outputCol='trigrams')
    ngram_tfidf = TFIDF_pipeline('trigram', ngrammer.getOutputCol(), 5000)
    assembler = ml.feature.VectorAssembler(
        inputCols=['unigram_features', 'trigram_features'],
        outputCol='features')
    regresser = ml.classification.LogisticRegression(
        maxIter=TRAINING_ITERS, featuresCol='features', labelCol='label')
    pipeline = ml.Pipeline(stages=[
        tokenizer, stemmer, stopword_remover, unigram_tfidf, ngrammer,
        ngram_tfidf, assembler, regresser
    ])
    return pipeline, regresser
def define_bigram_model():
    VOCAB_SIZE = 20000
    MINDF = 3
    TRAINING_ITERS = 150
    tokenizer = pipeline_utils.TweetTokenizer(inputCol='text', outputCol='words')
    # stopword_remover = ml.feature.StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='filtered', stopWords=list(all_stopwords))
    stemmer = pipeline_utils.Stemmer(inputCol=tokenizer.getOutputCol(),
                                     outputCol='cleaned_words')
    # NOTE: n=3 produces trigrams here despite the function name.
    ngrammer = ml.feature.NGram(n=3,
                                inputCol=stemmer.getOutputCol(),
                                outputCol='ngrams')
    counter = ml.feature.CountVectorizer(inputCol=ngrammer.getOutputCol(),
                                         outputCol='counts',
                                         vocabSize=VOCAB_SIZE,
                                         minDF=MINDF)
    normalizer = ml.feature.Normalizer(p=1.0,
                                       inputCol=counter.getOutputCol(),
                                       outputCol='tf_normalized')
    df_normalize = ml.feature.IDF(inputCol=normalizer.getOutputCol(),
                                  outputCol='features')
    regresser = ml.classification.LogisticRegression(
        maxIter=TRAINING_ITERS, featuresCol='features', labelCol='label')
    pipeline = ml.Pipeline(stages=[
        tokenizer, stemmer, ngrammer, counter, normalizer, df_normalize,
        regresser
    ])
    return pipeline, regresser
def TFIDF_pipeline(prefix, inputCol, vocab_size, min_df=3):
    counter = ml.feature.CountVectorizer(inputCol=inputCol,
                                         outputCol=prefix + '_counts',
                                         vocabSize=vocab_size,
                                         minDF=min_df)
    normalizer = ml.feature.Normalizer(p=1.0,
                                       inputCol=counter.getOutputCol(),
                                       outputCol=prefix + '_tf_normalized')
    df_normalize = ml.feature.IDF(inputCol=normalizer.getOutputCol(),
                                  outputCol=prefix + '_features')
    return ml.Pipeline(stages=[counter, normalizer, df_normalize])
def run(sc, args):
    sc.setLogLevel('FATAL')

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('year',
                            help='Year of prediction, in format YYYY.',
                            type=int)
    arg_parser.add_argument('month',
                            help='Month of prediction, in format MM.',
                            type=int)
    arg_parser.add_argument('day',
                            help='Day of prediction, in format DD.',
                            type=int)
    args = arg_parser.parse_args(args)

    ss = sql.SparkSession(sc)

    latlongrid = grid.LatLonGrid(lat_min=40.488320,
                                 lat_max=40.957189,
                                 lon_min=-74.290739,
                                 lon_max=-73.635679,
                                 lat_step=grid.get_lon_delta(
                                     1000, (40.957189 - 40.488320) / 2.0),
                                 lon_step=grid.get_lat_delta(1000))

    tweets_df = import_twitter_data(ss, 'tweets2.csv')

    prediction_date = datetime.date(args.year, args.month, args.day)
    NUM_DAYS_IN_HISTORY = 31
    history_cutoff = prediction_date - datetime.timedelta(
        days=NUM_DAYS_IN_HISTORY)

    filtered_tweets_df = filter_by_dates(ss, tweets_df, history_cutoff,
                                         prediction_date)
    tokens_df = group_by_grid_square_and_tokenize(ss, latlongrid,
                                                  filtered_tweets_df)

    # 2 ** 18, not 2 ^ 18: '^' is XOR in Python and would give only 15 features.
    hashing_tf = feature.HashingTF(numFeatures=(2 ** 18) - 1,
                                   inputCol='tokens',
                                   outputCol='token_frequencies')
    lda = (clustering.LDA().setFeaturesCol('token_frequencies').setK(
        10).setTopicDistributionCol('topic_distribution'))
    topic_distribution_pipeline = ml.Pipeline(stages=[hashing_tf, lda])
    lda_model = topic_distribution_pipeline.fit(tokens_df)
    topic_distributions = (lda_model.transform(tokens_df).select(
        ['grid_square', 'topic_distribution']))

    complaints_df = load_filter_format_valid_complaints(
        ss, 'crime_complaints_with_header.csv')
    complaints_df.show()
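# filter_by_dates (like import_twitter_data, group_by_grid_square_and_tokenize
# and load_filter_format_valid_complaints) is a project helper whose source is
# not included here. A minimal assumed sketch, supposing the tweets carry a
# 'timestamp' column; the column name and boundary handling are guesses, not
# the project's actual implementation.
from pyspark.sql import functions as F


def filter_by_dates(ss, tweets_df, start_date, end_date):
    # keep tweets with start_date <= timestamp < end_date
    return tweets_df.where((F.col('timestamp') >= str(start_date))
                           & (F.col('timestamp') < str(end_date)))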
def LSA_XGBoost_model():
    VOCAB_SIZE = 20000
    MINDF = 3
    TRAINING_ITERS = 150
    tokenizer = pipeline_utils.TweetTokenizer(inputCol='text', outputCol='words')
    stemmer = pipeline_utils.Stemmer(inputCol=tokenizer.getOutputCol(),
                                     outputCol='cleaned_words')
    stopword_remover = ml.feature.StopWordsRemover(
        inputCol=stemmer.getOutputCol(),
        outputCol='stopwords_removed',
        stopWords=list(all_stopwords))
    unigram_tfidf = TFIDF_pipeline('unigram', stopword_remover.getOutputCol(),
                                   10000)
    # NOTE: n=1 keeps single tokens even though the columns are named 'trigram*'.
    ngrammer = ml.feature.NGram(n=1,
                                inputCol=stemmer.getOutputCol(),
                                outputCol='trigrams')
    ngram_tfidf = TFIDF_pipeline('trigram', ngrammer.getOutputCol(), 5000)
    assembler = ml.feature.VectorAssembler(
        inputCols=['unigram_features', 'trigram_features'],
        outputCol='features')
    pca = ml.feature.PCA(inputCol=assembler.getOutputCol(),
                         k=250,
                         outputCol='lsa_features')
    # The scaler must not reuse 'features' as its output column: the assembler
    # already created that column, and Spark rejects duplicate output columns.
    scaler = ml.feature.StandardScaler(inputCol=pca.getOutputCol(),
                                       outputCol='scaled_features',
                                       withMean=True)
    # Spark's GBTClassifier stands in for XGBoost here.
    classifier = ml.classification.GBTClassifier(
        featuresCol=scaler.getOutputCol(),
        labelCol='label',
        subsamplingRate=0.5,
        featureSubsetStrategy='auto')
    pipeline = ml.Pipeline(stages=[
        tokenizer, stemmer, stopword_remover, unigram_tfidf, ngrammer,
        ngram_tfidf, assembler, pca, scaler, classifier
    ])
    return pipeline, pca
def define_unigram_model():
    VOCAB_SIZE = 20000
    MINDF = 3
    TRAINING_ITERS = 150
    tokenizer = pipeline_utils.TweetTokenizer(inputCol='text', outputCol='words')
    stopword_remover = ml.feature.StopWordsRemover(
        inputCol=tokenizer.getOutputCol(),
        outputCol='filtered',
        stopWords=list(all_stopwords))
    stemmer = pipeline_utils.Stemmer(
        inputCol=stopword_remover.getOutputCol(), outputCol='cleaned_words')
    counter = ml.feature.CountVectorizer(inputCol=stemmer.getOutputCol(),
                                         outputCol='counts',
                                         vocabSize=VOCAB_SIZE,
                                         minDF=MINDF)
    normalizer = ml.feature.Normalizer(p=1.0,
                                       inputCol=counter.getOutputCol(),
                                       outputCol='tf_normalized')
    df_normalize = ml.feature.IDF(inputCol=normalizer.getOutputCol(),
                                  outputCol='features')
    regresser = ml.classification.LogisticRegression(
        maxIter=TRAINING_ITERS,
        regParam=0.01,
        elasticNetParam=0.5,
        featuresCol='features',
        labelCol='label')
    # regresser = ml.classification.MultilayerPerceptronClassifier(maxIter=TRAINING_ITERS, layers=[3, 2, 1], blockSize=64, seed=1234)
    pipeline = ml.Pipeline(stages=[
        tokenizer, stopword_remover, stemmer, counter, normalizer,
        df_normalize, regresser
    ])
    return pipeline, regresser
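# A minimal sketch of how these define_*_model() pairs might be used, assuming
# a DataFrame with 'text' and 'label' columns. labeled_df, the split ratio and
# the accuracy metric are illustrative choices, not part of the code above.
pipeline, regresser = define_unigram_model()
train_df, test_df = labeled_df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_df)
predictions = model.transform(test_df)
evaluator = ml.evaluation.MulticlassClassificationEvaluator(
    labelCol='label', predictionCol='prediction', metricName='accuracy')
print('accuracy:', evaluator.evaluate(predictions))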
def tfidf_review_text(df):
    with Timer("TF-IDF for reviewText"):
        df = df.select(["reviewText"]).dropna()

    with Timer("TF-IDF pipeline"):
        tokenizer = ml.feature.Tokenizer(inputCol="reviewText",
                                         outputCol="token")
        hasher = ml.feature.CountVectorizer(inputCol="token", outputCol="hash")
        idf = ml.feature.IDF(inputCol="hash", outputCol="tfidf")
        pipeline = ml.Pipeline(stages=[tokenizer, hasher, idf])
        model = pipeline.fit(df)
        df = model.transform(df)

        vocab = model.stages[1].vocabulary
        print("Vectorizer vocab size:", len(vocab))
        idx2word = {idx: word for idx, word in enumerate(vocab)}

    with Timer("Convert TF-IDF sparseVector to str(word:value dict)"):
        my_udf = udf(lambda vec: sparse2dict(vec, idx2word), types.StringType())
        df = df.select("reviewText", my_udf("tfidf").alias("tfidf_final"))

    show_df(df, 10)
    return df
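# Timer and show_df are small project helpers assumed by both versions of
# tfidf_review_text above. A plausible minimal sketch; the project's own
# versions may log or format output differently.
import time
from contextlib import contextmanager


@contextmanager
def Timer(label):
    # context manager that prints how long the wrapped block took
    start = time.time()
    try:
        yield
    finally:
        print(f"{label}: {time.time() - start:.2f}s")


def show_df(df, n=20):
    # convenience wrapper around DataFrame.show without truncation
    df.show(n, truncate=False)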
df_labeled = df_labeled.na.drop().drop("version_idx")
cols_for_ml = df_prepped01.drop("name").drop("version_idx").schema.names

# pipeline stages
# index the label
labelIndexer = mlf.StringIndexer(inputCol="Label", outputCol="Label_idx")
# vectorise the input
toVec = mlf.VectorAssembler(inputCols=cols_for_ml, outputCol="Features")
# classify
classifier = DecisionTreeClassifier(labelCol="Label_idx",
                                    featuresCol="Features",
                                    maxDepth=10,
                                    maxBins=200)

# create pipeline of the stages and use it to train and test
pipeline = ml.Pipeline(stages=[labelIndexer, toVec, classifier])
train, test = df_labeled.randomSplit([0.7, 0.3], seed=12345)
df_pip = pipeline.fit(train)
predicted = df_pip.transform(test)

# print result
predicted.select("name", "Label_idx", "prediction", "rawPrediction",
                 "probability").show(30, False)


# function to evaluate result
def evaluate(method, predicted):
    evaluator_acc = MulticlassClassificationEvaluator(
        labelCol="Label_idx", predictionCol="prediction", metricName=method)
    accuracy = evaluator_acc.evaluate(predicted)
    return accuracy
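# Example use of the evaluate helper defined above; the chosen metric names
# are among those MulticlassClassificationEvaluator accepts.
print("accuracy:", evaluate("accuracy", predicted))
print("f1:", evaluate("f1", predicted))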
tokens_rdd = (tweets_df.rdd.map(row_to_gridsquare_tokens).reduceByKey(
    operator.concat))
tokens_df_schema = types.StructType([
    types.StructField('grid_square', types.IntegerType()),
    types.StructField('tokens', types.ArrayType(types.StringType()))
])
tokens_df = ss.createDataFrame(tokens_rdd, schema=tokens_df_schema)

# 2 ** 18, not 2 ^ 18: '^' is XOR in Python and would give only 15 features.
hashing_tf = feature.HashingTF(numFeatures=(2 ** 18) - 1,
                               inputCol='tokens',
                               outputCol='token_frequencies')
lda = (clustering.LDA().setFeaturesCol('token_frequencies').setK(
    10).setTopicDistributionCol('topic_distribution'))
topic_distribution_pipeline = ml.Pipeline(stages=[hashing_tf, lda])
lda_model = topic_distribution_pipeline.fit(tokens_df)
topic_distributions = lda_model.transform(tokens_df).select(
    ['grid_square', 'topic_distribution'])

# --------------------------------------------------------------------------------------------------
# PART 2: Get complaint counts per (grid square, date).
# --------------------------------------------------------------------------------------------------
complaints_df_schema = types.StructType([
    types.StructField('CMPLNT_NUM', types.IntegerType(), nullable=False),
    types.StructField('CMPLNT_FR_DT', types.StringType()),
    types.StructField('CMPLNT_FR_TM', types.StringType()),
    types.StructField('CMPLNT_TO_DT', types.StringType()),
    types.StructField('CMPLNT_TO_TM', types.StringType()),
    types.StructField('RPT_DT', types.StringType(), nullable=False),
withMean=False, inputCol='features', outputCol='scaledFeatures')

# Use PCA to reduce dimensionality of scaled vectors
reducer = smf.PCA(k=10,
                  inputCol=scaler.getOutputCol(),
                  outputCol='selectedFeatures')

# Use a classifier to generate the final predictions
classifier = smc.GBTClassifier(labelCol='label',
                               featuresCol=reducer.getOutputCol(),
                               predictionCol='predictedLabel')

# Combine all steps in a pipeline
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])

# Create an evaluator which will quantify model performance
# evaluator = sme.BinaryClassificationEvaluator(
#     labelCol='label',
#     rawPredictionCol='predictedLabel',
#     metricName='areaUnderROC'
# )
eval_f1 = sme.MulticlassClassificationEvaluator(labelCol='label',
                                                predictionCol='predictedLabel',
                                                metricName='f1')

# Set up a parameter grid for cross validation
param_grid = smt.ParamGridBuilder().addGrid(
    reducer.k, [10, 20, 50, 75]).addGrid(classifier.maxDepth,