def main(*args):
    if len(args) != 2:
        print("Please provide one input and one output directory!")
        sys.exit(1)
    input_fn, output_fn = args[0], args[1]

    conf = SparkConf()
    conf.setAppName("grant")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Load the abstract content in the test folder into Spark,
    # clean the text, tokenize the corpus, and stem the words
    abstract = sc.textFile(input_fn)
    df_abs = (abstract.map(lambda doc: text_cleaning(doc))
              .filter(lambda doc: len(doc) > 0)
              .filter(lambda line: not line.startswith('app'))
              .map(lambda doc: doc.split(' '))
              .map(lambda word: [x for x in word if len(x) > 0])
              .map(lambda word: stem(word))
              .map(lambda doc: (int(doc[0]), doc[1:]))
              .filter(lambda doc: len(doc[1]) > 0)
              .toDF(['Id', 'words']))

    # Build the pipeline and LDA model with the online optimizer
    stop_words = StopWordsRemover(inputCol='words', outputCol='clean')
    stop_words.setStopWords(stop_words.loadDefaultStopWords('english'))
    countv = CountVectorizer(inputCol=stop_words.getOutputCol(), outputCol="tokens")
    idf = IDF(inputCol=countv.getOutputCol(), outputCol="features")
    lda = LDA(maxIter=10, k=10, optimizer='online')
    pipeline = Pipeline(stages=[stop_words, countv, idf, lda])
    lda_model = pipeline.fit(df_abs)
    labels = lda_model.transform(df_abs)

    # Identify the label as the topic with the maximum probability
    # and save the labels to file
    topic_labels = (labels.select('Id', 'topicDistribution')
                    .rdd
                    .map(lambda x: (x[0], np.argmax(x[1])))
                    .saveAsTextFile(os.path.join(output_fn, 'labels')))

    # Get the topics
    wordnum = 5  # number of words to keep per topic
    vocabulary = lda_model.stages[1].vocabulary
    voc_bv = sc.broadcast(vocabulary)
    topic_df = (lda_model.stages[3].describeTopics(wordnum)
                .rdd
                .map(lambda x: (x[0], [voc_bv.value[Id] for Id in x[1]], x[2]))
                .saveAsTextFile(os.path.join(output_fn, 'words')))
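# A minimal driver sketch (not part of the original snippet): it assumes this
# module defines the imports used above (sys, os, numpy as np, SparkConf,
# SparkContext, SQLContext, the ML feature classes) plus the text_cleaning and
# stem helpers, and that the script is launched with spark-submit. The script
# name and paths below are hypothetical.
if __name__ == '__main__':
    # e.g. spark-submit grant_lda.py /path/to/abstracts /path/to/output
    main(*sys.argv[1:])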
def lr_train(data):
    # Logistic regression using count-vector features
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel = label_stringIdx.fit(data)
    data = lsmodel.transform(data)
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="cfeatures",
                                   vocabSize=10000, minDF=5)
    '''hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5)'''
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0,
                            featuresCol=countVectors.getOutputCol(), labelCol="label")
    pipeline = Pipeline(stages=[countVectors, lr])
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)
    # predictions.show(5)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    # evaluator.evaluate(predictions)
    return (evaluator.evaluate(predictions), lsmodel.labels, pipelineFit)
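# Hypothetical usage sketch (not from the original source): it assumes `df` is
# a DataFrame that already has the raw label column "_c0" and a tokenized,
# stopword-filtered array column "filtered".
metric, label_names, fitted_pipeline = lr_train(df)
print("Held-out evaluation metric: {:.3f}".format(metric))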
def lr_train_tvs(data):
    # Logistic regression using count-vector features,
    # tuned with a train/validation split over maxIter
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel = label_stringIdx.fit(data)
    data = lsmodel.transform(data)
    # (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="cfeatures",
                                   vocabSize=10000, minDF=5)
    '''hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5)'''
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    lr = LogisticRegression(regParam=0.3, elasticNetParam=0,
                            featuresCol=countVectors.getOutputCol(), labelCol="label")
    pipeline = Pipeline(stages=[countVectors, lr])
    grid = ParamGridBuilder().addGrid(lr.maxIter, [10, 15, 20]).build()
    tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                               evaluator=evaluator, trainRatio=0.9)
    tvs_model = tvs.fit(data)
    return (evaluator.evaluate(tvs_model.transform(data)), lsmodel.labels, tvs_model)
def get_pipeline():
    # Hard-coded labels (original texts only):
    auth_hard_lbl = AuthorLabeler(inputCol='author', outputCol='author_label')
    ttl_hard_lbl = TitleLabeler(inputCol='title', outputCol='title_label')

    # Labels
    author_labeler = StringIndexer(inputCol="author", outputCol="author_id")
    title_labeler = StringIndexer(inputCol="title", outputCol="title_id")
    vector_ider = VectorAssembler(
        inputCols=["author_id", "title_id", "excerpt_number"],
        outputCol="id_vector")

    tokenizer = SpacyTokenizer(inputCol='excerpt', outputCol='words')

    # TF-IDF
    countvec = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='termfreq')
    idf = IDF(inputCol=countvec.getOutputCol(), outputCol='tfidf')

    # Word2Vec
    word2vec = Word2Vec(vectorSize=250, minCount=2,
                        inputCol=tokenizer.getOutputCol(), outputCol="w2v")
    w2v_2d = Word2Vec(vectorSize=2, minCount=2,
                      inputCol=tokenizer.getOutputCol(), outputCol="w2v_2d")

    # TODO: Include metadata
    # char_count =
    # word_count =
    # sent_count =
    # para_count =

    # TODO: Play with n-grams
    # NGram(n=2, inputCol=tokenizer.getOutputCol(), outputCol="2_gram")
    # NGram(n=3, inputCol=tokenizer.getOutputCol(), outputCol="3_gram")
    # NGram(n=4, inputCol=tokenizer.getOutputCol(), outputCol="4_gram")
    # NGram(n=5, inputCol=tokenizer.getOutputCol(), outputCol="5_gram")

    pipeline = Pipeline(stages=[author_labeler, title_labeler, vector_ider,
                                tokenizer, countvec, idf, word2vec, w2v_2d])
    return pipeline
def benchmark_body_pipeline(cleaned_dataframe, stopwordlist=None):
    """NLP pipeline. Tokenizes, removes stopwords, and computes TF-IDF.

    Returns the transformed data as 'features' and the vocabulary of words."""
    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")
    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(),
                                 outputCol="Text_counts_raw")
    idf = IDF(inputCol=count_vect.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, stop_remover, count_vect, idf])
    model = pipeline.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)
    return featurized_data, model.stages[-2].vocabulary
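# Hypothetical usage sketch (not from the original source): `cleaned_df` is
# assumed to be a DataFrame with a string column named "Text"; the custom
# stopword list is optional.
features_df, vocab = benchmark_body_pipeline(cleaned_df, stopwordlist=["the", "a", "an"])
features_df.select("features").show(3, truncate=False)
print("Vocabulary size:", len(vocab))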
# Import json objects from tar file
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

# Set up a pipeline that adds ML features: tokens, stems, n-grams, tf, tf-idf, word2vec
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens",
                           pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(),
                  outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(),
                     outputCol='word2vec_large')
pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

# Use the model to transform the data
df_transformed = model.transform(opinion_df)

# Retrieve the top 10 words of a document by tf-idf weight; assumes 'row' contains one row
# of the transformed dataframe and 'opinion_cv_model' is the fitted CountVectorizerModel
# (e.g. model.stages[3])
np.array(opinion_cv_model.vocabulary)[row['token_idf'].indices[np.argsort(row['token_idf'].values)]][:-11:-1]

# save and retrieve dataframe
                  outputCol='body')
filterer = Filterer(key='subreddit', val='body', inputCol='subreddit',
                    outputCol='body', minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(), outputCol="tokens",
                           pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="tf",
                     minDF=args.mindf, vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(), outputCol='top_words',
                      nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit', outputCol='norm', spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits',
                                nsubreddits=args.nsubreddits)
pipeline = Pipeline(stages=[
    extractor, cleaner, filterer, tokenizer, remover,
    cv, idf, topkwords, cos_similarity, topksubreddits
])
# List of stopwords to be removed from the posts
stop_words = list(set(stopwords.words('english')))

label_indexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
regex_tokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                 outputCol="words", pattern="[^0-9a-z#+_]+")
stopword_remover = StopWordsRemover(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(stop_words)
count_vectorizer = CountVectorizer(inputCol=stopword_remover.getOutputCol(),
                                   outputCol="countFeatures", minDF=5)
idf = IDF(inputCol=count_vectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol=idf.getOutputCol(),
                            numTrees=100, maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(label_indexer.labels)

# Creating the pipeline
pipeline = Pipeline(stages=[
    label_indexer, bs_text_extractor, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, rf, idx_2_string
])

# Fitting the model
model = pipeline.fit(train)
def update_text_with_key_ngrams(df, n, seed=42, outputCol="ngram_text",
                                pattern=r"(?!(?<='))\w+"):
    def build_text(words):
        # Turn the bag of words back into a sentence and check whether any of
        # the key n-grams occurs in it, e.g. bag of words = ["hi", "i", "ralf"]
        # and key_bigram = "i ralf" --> sentence = "hi i ralf" contains the bigram.
        # If an n-gram occurs, replace its words in the sentence with the
        # underscore version of the n-gram ("i_ralf").
        sentence = ' '.join(words)
        for ngram in key_ngrams:
            if ngram in sentence:
                sentence = sentence.replace(ngram, ngram.replace(" ", "_"))
        return sentence

    outputs = {
        "tokenizer": "words",
        "ngram": "ngrams",
        "cv": "tf",
        "idf": "tf_idf",
        "build_text_udf": outputCol
    }

    # Build pipeline
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol=outputs["tokenizer"],
                               pattern=pattern,
                               gaps=False)
    ngram = NGram(n=n, inputCol=tokenizer.getOutputCol(), outputCol=outputs["ngram"])
    cv = CountVectorizer(inputCol=ngram.getOutputCol(), outputCol=outputs["cv"])
    idf = IDF(inputCol=cv.getOutputCol(), outputCol=outputs["idf"])
    pipe = Pipeline(stages=[
        tokenizer,  # transform
        ngram,      # transform
        cv,         # fit_transform
        idf         # fit
    ])

    print("\t Computing tf_idf matrix for {}-grams...".format(n))
    pipe_model = pipe.fit(df)  # calls transform on tokenizer & ngram,
                               # fit_transform on cv and fit on idf
    vocabulary = np.array(pipe_model.stages[2].vocabulary)
    print("\t\t vocabulary size: {}".format(len(vocabulary)))
    df = pipe_model.transform(df)

    # Train test split
    train, _ = df.randomSplit([0.8, 0.2], seed=seed)
    train.persist(StorageLevel.MEMORY_AND_DISK)

    # Fit linear SVM
    svc = LinearSVC(maxIter=100, regParam=0.1, featuresCol="tf_idf")
    print("\t Estimating key {}-grams with SVC...".format(n))
    svc_model = svc.fit(train)

    # Pick the n-grams with the most extreme (lowest/highest) weights
    print("\t Update text with key {}-grams...".format(n))
    coeffs = svc_model.coefficients.toArray()
    key_ngrams = get_n_extremes_of_a_in_b(coeffs, vocabulary, 50)

    build_text_udf = F.udf(build_text)
    df = df.withColumn(outputs["build_text_udf"],
                       build_text_udf(F.col(tokenizer.getOutputCol())))
    print()
    return df
                                     seed=42, outputCol=outputCol, pattern=pattern)
print("\n")

## PREDICT LABEL BASED ON TF-IDF OF UPDATED TEXT
print("Computing TF-IDF matrix for updated text...")
tokenizer = RegexTokenizer(inputCol=outputCol,
                           outputCol="words_with_ngrams",
                           pattern=pattern,
                           gaps=False)
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                      outputCol="filtered_words")
cv = CountVectorizer(inputCol=stop_words_remover.getOutputCol(), outputCol="final_tf")
idf = IDF(inputCol=cv.getOutputCol(), outputCol="final_tf_idf")
pipe = Pipeline(stages=[
    tokenizer,
    stop_words_remover,
    cv,
    idf
])
reviews_mini = pipe.fit(reviews_mini).transform(reviews_mini)

## Train test split
train, test = reviews_mini.randomSplit([0.8, 0.2], seed=seed)
train.persist(StorageLevel.MEMORY_AND_DISK)
para_train = (
    articles_by_paragraph.select('article_id', 'p_index', 'paragraph')  # select unique identifiers
    .where(col('paragraph').isNotNull())                     # ignore blank paragraphs
    .withColumn('paragraph', clean_udf('paragraph'))         # clean the text
    .withColumn('paragraph', split(col('paragraph'), ' '))   # split on blank space to tokenize words
    .withColumnRenamed('paragraph', 'text')                  # rename column to 'text' for the pipeline
)
para_train.show(5)

tf = CountVectorizer(inputCol='text', outputCol='tf_result', minDF=0.05, maxDF=0.9)
idf = IDF(inputCol=tf.getOutputCol(), outputCol='features')
lda = LDA(k=20, maxIter=10)
paragraph_pipe = Pipeline(stages=[tf, idf, lda])
para_model = paragraph_pipe.fit(para_train)

# models will not overwrite existing ones of the same name
"""import shutil, os
if os.path.exists("../models/articles_LDA"):
    shutil.rmtree("../models/articles_LDA")"""
para_model.save("../models/articles_LDA_")
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()
data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)

textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(),
                              stopWords=selfstopwords, outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords, stopwords1, cv, idf, lda])
model = pipe1.fit(df)
output = model.transform(df)

def topicsTerms(vocab, termindices, leng=None):
    # Map topic term indices back to vocabulary words, optionally keeping only the first `leng`
    if not leng:
        return [vocab[t] for t in termindices]
    return [vocab[t] for t in termindices][:leng]

def topicsTerm_udf(vocab, leng=None):
    return udf(lambda x: topicsTerms(vocab, x, leng))
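# A sketch of how the UDF above might be applied (not in the original snippet):
# it assumes the pipeline stage order defined above, so stages[3] is the fitted
# CountVectorizerModel and stages[-1] is the LDAModel.
from pyspark.sql.functions import col

vocab = model.stages[3].vocabulary
topics = model.stages[-1].describeTopics(numOfKeywords)
topics.withColumn('topicWords',
                  topicsTerm_udf(vocab, numOfKeywords)(col('termIndices'))) \
      .select('topic', 'topicWords') \
      .show(truncate=False)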
# %%
dataSet = dataSet.withColumn('class', dataSet['class'].cast(IntegerType()))
dataSet = dataSet.select('class', 'cleanReview').withColumnRenamed(
    'cleanReview', 'reviews')

# %%
trainDF, testDF = dataSet.randomSplit([0.8, 0.2])
trainDF.show()
testDF.show()

# %%
tokenizer = Tokenizer(inputCol="reviews", outputCol="tokens")
countVector = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='features')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, countVector, idf])
pipelineModel = pipeline.fit(trainDF)

# %%
pTrainDF = pipelineModel.transform(trainDF)
pTestDF = pipelineModel.transform(testDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class",
                                              predictionCol="prediction",
                                              metricName="f1")
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
lrModel = lr.fit(pTrainDF)
predictionsLR = lrModel.transform(pTestDF)
evaluator.evaluate(predictionsLR)
sampled.groupby('label').count().toPandas()

# # Data Ingestion and Vectorization

# In[18]:

# Tokenize the training data: split each URL string into words
regexTokenizer = RegexTokenizer(inputCol="url", outputCol="Words", pattern="\\W")

# CountVectorizer converts the words into feature vectors; it is used here because it gives better results
countVectors = CountVectorizer(inputCol=regexTokenizer.getOutputCol(),
                               outputCol="rawfeatures", vocabSize=10000, minDF=5)

idf = IDF(inputCol=countVectors.getOutputCol(), outputCol="features")

# Create the pipeline
pipeline = Pipeline(stages=[regexTokenizer, countVectors, idf])

# Fit the pipeline to the training documents.
# Pass 'sampled' as the argument to use the balanced dataset
pipelineFit = pipeline.fit(sampled)

# Transform the dataset with the fitted pipeline
# Pass 'sampled' as the argument to use the balanced dataset
dataset = pipelineFit.transform(sampled)

# Randomly split the dataset into training and testing sets (80% / 20%)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed=100)
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol='reviewText', outputCol='reviewWords')
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                      outputCol='reviewWordsWithoutTrash')
vectorizer = CountVectorizer(inputCol=stop_words_remover.getOutputCol(),
                             outputCol="word_vector", minDF=150)
lr = LinearRegression(featuresCol=vectorizer.getOutputCol(), labelCol='overall')
pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer, lr])
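# Hypothetical usage sketch (not from the original source): it assumes a
# DataFrame `reviews_df` with a string column 'reviewText' and a numeric
# rating column 'overall', matching the pipeline defined above.
from pyspark.ml.evaluation import RegressionEvaluator

train_df, test_df = reviews_df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_df)
predictions = model.transform(test_df)
rmse = RegressionEvaluator(labelCol='overall', predictionCol='prediction',
                           metricName='rmse').evaluate(predictions)
print("RMSE:", rmse)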
remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
text = remover.transform(text)
text.show(5)

ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)

count_vec = CountVectorizer(inputCol=ngramer.getOutputCol(), outputCol='ft_features')
count_vec_model = count_vec.fit(text)
vocab = count_vec_model.vocabulary
text = count_vec_model.transform(text)
text.show(5)

idf = IDF(inputCol=count_vec.getOutputCol(), outputCol='features')
text = idf.fit(text).transform(text)

lda = LDA(featuresCol=idf.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)
topics = lda_model.describeTopics()
# topics_words = topics.rdd.map(lambda x: x['termIndices']).map(lambda x: [vocab[i] for i in x]).collect()
get_topics_words = F.udf(lambda x: [vocab[i] for i in x], ArrayType(StringType()))
topics = topics.withColumn('topic_words', get_topics_words(F.col('termIndices')))
topics.show()

text = lda_model.transform(text)
text.show(5)