def countVectorizer(infoData):
    colName = infoData.get(pc.COLMTOENCODE)
    dataset = infoData.get(pc.DATASET)
    encodedColm = infoData.get(pc.ENCODEDCOLM)
    originalColmName = infoData.get(pc.ORIGINALCOLMNAME)
    oneHotEncoderPathMapping = infoData.get(pc.ONEHOTENCODERPATHMAPPING)
    storageLocation = infoData.get(pc.STORAGELOCATION)

    countVectorizer = CountVectorizer(inputCol=colName,
                                      outputCol=encodedColm).fit(dataset)
    '''oneHotEncoderPath = storageLocation + modelId.upper() + PredictiveConstants.ONEHOTENCODED.upper() + PredictiveConstants.PARQUETEXTENSION
    oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
    oneHotEncoderPathMapping.update({
        PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath
    })'''
    oneHotEncoderPath = storageLocation + pc.ONEHOTENCODED_.upper() + \
        originalColmName.upper() + pc.PARQUETEXTENSION
    countVectorizer.write().overwrite().save(oneHotEncoderPath)
    oneHotEncoderPathMapping.update({originalColmName: oneHotEncoderPath})

    dataset = countVectorizer.transform(dataset)
    infoData.update({
        pc.ONEHOTENCODERPATHMAPPING: oneHotEncoderPathMapping,
        pc.DATASET: dataset
    })
    return infoData
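# A minimal, hypothetical sketch of how the CountVectorizerModel persisted above could be
# reloaded later (e.g. when scoring new data). The `pc` constants, the stored path mapping
# and the helper name are assumptions carried over from countVectorizer() above.
from pyspark.ml.feature import CountVectorizerModel


def loadCountVectorizer(infoData):
    dataset = infoData.get(pc.DATASET)
    oneHotEncoderPathMapping = infoData.get(pc.ONEHOTENCODERPATHMAPPING)
    originalColmName = infoData.get(pc.ORIGINALCOLMNAME)
    # reload the fitted CountVectorizerModel from the path stored during training
    countVectorizer = CountVectorizerModel.load(
        oneHotEncoderPathMapping.get(originalColmName))
    infoData.update({pc.DATASET: countVectorizer.transform(dataset)})
    return infoData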
def oneHotEncodeData(self, sentimentInfoData):
    colName = sentimentInfoData.get(pc.COLMTOENCODE)
    dataset = sentimentInfoData.get(pc.DATASET)
    vectorizedFeaturescolmName = "features"  # temp fix for testing only
    dataset = dataset.drop(vectorizedFeaturescolmName)

    oneHotEncodedColName = pc.ONEHOTENCODED_ + colName
    countVectorizer = CountVectorizer(inputCol=pc.DMXSTOPWORDS,
                                      outputCol=oneHotEncodedColName).fit(dataset)
    '''oneHotEncoderPath = storageLocation + modelId.upper() + PredictiveConstants.ONEHOTENCODED.upper() + PredictiveConstants.PARQUETEXTENSION
    oneHotEncoder.write().overwrite().save(oneHotEncoderPath)
    oneHotEncoderPathMapping.update({
        PredictiveConstants.ONEHOTENCODED: oneHotEncoderPath
    })'''
    dataset = countVectorizer.transform(dataset)
    # need to store the path of the count vectorizer to use at the time of performing sentiment analysis.

    '''create feature colm from encoded colm'''
    featureassembler = VectorAssembler(inputCols=[oneHotEncodedColName],
                                       outputCol=vectorizedFeaturescolmName,
                                       handleInvalid="skip")
    dataset = featureassembler.transform(dataset)

    sentimentInfoData.update({
        pc.FEATURECOLUMN: vectorizedFeaturescolmName,
        pc.DATASET: dataset
    })
    return sentimentInfoData
def train(self):
    self.__prepare()
    spark = SparkSession\
        .builder\
        .appName("Kursach")\
        .getOrCreate()

    input_file = spark.sparkContext.textFile('./w2v.txt')
    # print(input_file.collect())
    prepared = input_file.map(lambda x: ([x]))
    df = prepared.toDF()
    prepared_df = df.selectExpr('_1 as text')

    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(words)
    # print(stop_words)
    # filtered.show()
    # words.select('words').show(truncate=False, vertical=True)
    # filtered.select('filtered').show(truncate=False, vertical=True)

    vectorizer = CountVectorizer(inputCol='filtered',
                                 outputCol='raw_features').fit(filtered)
    featurized_data = vectorizer.transform(filtered)
    featurized_data.cache()
    vocabulary = vectorizer.vocabulary
    # featurized_data.show()
    # featurized_data.select('raw_features').show(truncate=False, vertical=True)
    # print(vocabulary)

    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    self.__word2Vec = Word2Vec(vectorSize=3,
                               minCount=0,
                               inputCol='words',
                               outputCol='result')
    self.__model = self.__word2Vec.fit(filtered)
    w2v_df = self.__model.transform(words)
    w2v_df.show()
    spark.stop()
def LDAThis(sc, RDD, minFreq, numTopics, maxIter, wordsPerTopic):
    '''
    Arguments:
        sc: A SparkContext object
        RDD: An RDD with rows as tokenized sentences
        minFreq: Minimum document frequency for CountVectorizer
        numTopics: Number of topics
        maxIter: Max number of iterations for LDA training
        wordsPerTopic: Number of words to show per topic
    Requirements:
        sqlContext = SQLContext(sc) <- must be defined outside the function
    '''
    StopWords = stopwords.words("english")
    sqlContext = SQLContext(sc)

    # Structure Data
    idRDD = RDD.map(lambda words: [
        x for x in words if x.isalpha() and x not in StopWords
    ]).filter(lambda x: len(x) > 2).zipWithIndex()
    idDF = sqlContext.createDataFrame(idRDD, ["tokens", 'index'])

    # Term Frequency
    CVecModel = CountVectorizer(inputCol="tokens",
                                outputCol="rawFeatures",
                                vocabSize=5000,
                                minDF=minFreq).fit(idDF)
    resultCVec = CVecModel.transform(idDF)
    vocabArray = CVecModel.vocabulary

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(resultCVec)
    resultTFIDF = idfModel.transform(resultCVec)

    # LDA
    resultLDA = LDA.train(
        resultTFIDF.select('index', 'features').rdd.mapValues(Vectors.fromML).map(list),
        k=numTopics,
        maxIterations=maxIter)
    topicIndices = sc.parallelize(
        resultLDA.describeTopics(maxTermsPerTopic=wordsPerTopic))
    topicsFinal = topicIndices.map(lambda topic: render_topics(
        topic, wordsPerTopic, vocabArray)).collect()

    # Show Topics
    for topic in range(len(topicsFinal)):
        print("Topic " + str(topic) + ":")
        for term in topicsFinal[topic]:
            print(term)
        print('\n')

    return resultLDA
def GetFeatures(data):
    '''
    # TF-IDF Score
    hashingTF = HashingTF(inputCol="Filtered", outputCol="rawFeatures", numFeatures=3000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2.0)
    pipeline = Pipeline(stages=[hashingTF, idf])
    dataset = pipeline.fit(data).transform(data)
    dataset.show(5)
    '''
    # Term Frequency
    # minDF: specifies the minimum number (or, if < 1.0, fraction) of different documents
    # a term must appear in to be included in the vocabulary
    model = CountVectorizer(inputCol="Filtered",
                            outputCol="features",
                            minDF=0.03).fit(data)
    df = model.transform(data)
    print("========= Finish Getting Features for Training =========")
    return df, model.vocabulary
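# Hypothetical usage sketch for GetFeatures(); the input DataFrame `data` and its
# "Filtered" token column are assumptions. It shows how the returned vocabulary maps
# sparse feature indices back to terms.
featured_df, vocabulary = GetFeatures(data)
first_row = featured_df.select("features").first()
for idx, cnt in zip(first_row["features"].indices, first_row["features"].values):
    print(vocabulary[int(idx)], int(cnt))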
def getCountVector(final_df):
    '''
    This function accepts as input a dataframe with a column named 'data'
    containing each document as a row. This will be converted to a count vector
    and the output column will be named 'indexedFeatures'.

    It returns the original dataframe with the additional column 'indexedFeatures',
    along with the count vector model.

    Arg1 : dataframe to compute the count vector
    '''
    # getting the count vector
    print('************* inside the count vector ****************')
    print('************* inside the count vector ****************')
    print('************* inside the count vector ****************')
    cv = CountVectorizer(inputCol="data",
                         outputCol="indexedFeatures").fit(final_df)
    countVector_df = cv.transform(final_df)
    print('************* returning the count vector ****************')
    print('************* returning the count vector ****************')
    print('************* returning the count vector ****************')
    return countVector_df, cv
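# Hypothetical usage sketch for getCountVector(); the SparkSession and the toy documents
# are assumptions. Note that CountVectorizer expects the 'data' column to hold an array
# of tokens per document.
docs_df = spark.createDataFrame(
    [(["spark", "count", "vectorizer"],),
     (["count", "vector", "example"],)],
    ["data"])
countVector_df, cv_model = getCountVector(docs_df)
countVector_df.select("indexedFeatures").show(truncate=False)
print(cv_model.vocabulary)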
def main():
    set_pandas_options()
    app_name = "Case Study 2: Email Analytics"
    conf = SparkConf().setAppName(app_name)
    conf = (conf.setMaster('local[*]')
            .set("spark.driver.host", "localhost")
            .set('spark.executor.memory', '4G')
            .set('spark.driver.memory', '8G')
            .set('spark.driver.maxResultSize', '10G'))
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info("pyspark script logger initialized")

    # 1 Load data into Spark DataFrame
    LOG = get_hdfs_filepath('*/*/*')

    # read text files
    log_txt_df = sc.wholeTextFiles(LOG).filter(lambda line: line != '').toDF()

    # Convert strings to columns
    udf1 = udf(to_utc_timestamp, TimestampType())
    df = log_txt_df
    df = df.select(df._2.alias('line'))
    temp = df.select(
        regexp_extract(col('line'), r'Message-ID:\s<.*>', 0).alias('Message_ID'),
        regexp_extract(
            col('line'),
            r'\d{1,2}\s\w{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s(\+|\-)\d{4}(.*)',
            0).alias("Date"),
        regexp_extract(col('line'), r'From:\s(.*)', 0).alias("From"),
        regexp_extract(
            col('line'),
            r"To:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(\S+@\S+)(?:\n|\r\n?)Subject:\s",
            0).alias("To"),
        regexp_extract(
            col('line'),
            r"Subject:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}",
            1).alias("Subject"),
        regexp_extract(
            col('line'),
            r"Cc:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(?:\n|\r\n?)Mime-Version:\s",
            0).alias("Cc"),
        regexp_extract(col('line'), r'Mime-Version:\s(.+)', 1).alias("Mime_Version"),
        regexp_extract(col('line'), r'Content-Type:\s(.*)', 1).alias("Content_Type"),
        regexp_extract(col('line'), r"Content-Transfer-Encoding:\s(.+)",
                       1).alias("Content_Transfer_Encoding"),
        regexp_extract(col('line'), r"X-From:\s(.*)(?:\n|\r\n?)X-To:\s",
                       0).alias("X_From"),
        regexp_extract(col('line'), r'X-To:\s(.*)(?:\n|\r\n?)X-cc:\s',
                       0).alias("X_To"),
        regexp_extract(col('line'), r'X-cc:\s(.*)(?:\n|\r\n?)X-bcc:\s',
                       0).alias("X_cc"),
        regexp_extract(col('line'), r'X-bcc:\s(.*)(?:\n|\r\n?)X-Folder:\s',
                       0).alias("X_bcc"),
        regexp_extract(col('line'), r'X-Folder:\s(.*)(?:\n|\r\n?)X-Origin:\s',
                       0).alias("X_Folder"),
        regexp_extract(col('line'), r"X-Origin:\s(.*)(?:\n|\r\n?)X-FileName:\s",
                       0).alias("X_Origin"),
        regexp_extract(col('line'), r"X-FileName:\s(.*)", 0).alias("X_FileName"),
        regexp_extract(
            col('line'),
            r"X-FileName:\s(.*)((?:\n|\r\n?){1,}(.*)){1,}((?:(?:\n|\r\n?).+)+)",
            0).alias("FYI"))
    # temp.cache()

    temp1 = temp.select(
        expr("substring(Message_ID, 14, length(Message_ID)-14)").alias("Message_ID"),
        'Date',
        udf1('Date').alias('UTC_timestamp'),
        expr("substring(From, 7, length(From)-6)").alias("From"),
        expr("substring(To, 5, length(To)-15)").alias("To"),
        "Subject",
        expr("substring(Cc, 5, length(Cc)-20)").alias("Cc"),
        "Mime_Version",
        "Content_Type",
        'Content_Transfer_Encoding',
        expr("substring(X_From, 9, length(X_From)-16)").alias("X_From"),
        expr("substring(X_To, 7, length(X_To)-14)").alias("X_To"),
        expr("substring(X_cc, 7, length(X_cc)-15)").alias("X_cc"),
        expr("substring(X_bcc, 8, length(X_bcc)-19)").alias("X_bcc"),
        expr("substring(X_Folder, 11, length(X_Folder)-22)").alias("X_Folder"),
        expr("substring(X_Origin, 11, length(X_Origin)-24)").alias("X_Origin"),
        expr("substring(X_FileName, 13, length(X_FileName)-15)").alias("X_FileName"),
        regexp_replace(
            col('FYI'),
            r"(X-FileName:\s(.*)(?:\n|\r\n?){1,})|(-*Original Message-*(.*)((?:\n|\r\n?){1,}(.*)){0,}((?:(?:\n|\r\n?).+)+))",
            '').alias('FYI'))
    # temp1.cache()

    result = temp1.select(
        "Message_ID", 'Date', 'UTC_timestamp', "From",
        regexp_replace(col('To'), r"\r\n\t", "").alias("To"),
        "Subject",
        regexp_replace(col('Cc'), r"\r\n\t", "").alias("Cc"),
        "Mime_Version", "Content_Type", 'Content_Transfer_Encoding',
        "X_From", "X_To", "X_cc", "X_bcc", "X_Folder", "X_Origin", "X_FileName",
        regexp_replace(col('FYI'), r"(^\s{1,})|(\n{2,})", '').alias('FYI'))

    zz = result.limit(5).toPandas()
    LOGGER.info("\n\n1.\tLoad data into Spark DataFrame\tDone!\n\n{}\n".format(zz))

    # 2 Display the top 10 high-frequency users based on weekly numbers of emails sent
    df1 = result
    freq = df1.groupBy('From').agg(
        (count('UTC_timestamp') /
         ((max(unix_timestamp(col('UTC_timestamp'))) -
           min(unix_timestamp(col('UTC_timestamp')))) / 604800)
         ).alias('rate_per_week')).orderBy("rate_per_week", ascending=False)
    zz = freq.limit(10).toPandas()
    LOGGER.info(
        "\n\n2.\tDisplay the top 10 high-frequency users based on weekly numbers of emails sent\tDone!\n\n{}\n"
        .format(zz))

    # 3a Extract top 20 keywords from the subject text for the top 10 high-frequency users
    top = freq.limit(10)
    top_subj = df1.join(top, df1["From"] == top["From"],
                        "inner").select(df1['From'], df1['Subject'])
    top_texts = top_subj.groupBy("From").agg(
        concat_ws(" ", collect_list("Subject")).alias("texts"))
    top_texts = top_texts.select('texts').agg(
        concat_ws(" ", collect_list("texts")).alias("subjects"))

    # Extract words
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(top_texts)

    # Remove stopwords; extend the stop-words dictionary by adding custom stop words
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "", "fw"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)

    # Extract top 20 keywords after removing the common stop words
    # Generate features
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3a.\tExtract top 20 keywords from the subject text for the top 10 high-frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 3b Extract top 20 keywords from the subject text for the non-high frequency users
    w = Window().orderBy(lit('A'))
    bottom = freq.orderBy("rate_per_week",
                          ascending=False).withColumn("row_num", row_number().over(w))
    bottom = bottom.where(col('row_num') > 10).select('From', 'rate_per_week')
    bottom_subj = df1.join(bottom, df1["From"] == bottom["From"],
                           "inner").select(df1["From"], df1["Subject"])
    bottom_texts = bottom_subj.groupBy("From").agg(
        concat_ws(" ", collect_list("Subject")).alias("texts"))
    bottom_texts = bottom_texts.select('texts').agg(
        concat_ws(" ", collect_list("texts")).alias("subjects"))

    # Extract words
    tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
    transformed = tokenizer.transform(bottom_texts)

    # Remove stopwords (custom stopwords)
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)

    # Generate features
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)
    counts = featured.select('features').collect()
    a = cvmodel.vocabulary
    b = counts[0]['features'].values
    d = {'words': a, 'counts': b}
    df = pd.DataFrame(d)
    zz = df.head(20)
    LOGGER.info(
        "\n\n3b.\tExtract top 20 keywords from the subject text for the non-high frequency users\tDone!\n\n{}\n"
        .format(zz))

    # 6 Introduce a new column label to identify new, replied, and forwarded messages
    df = result

    def to_label(sbj):
        l1 = "RE" if sbj.startswith("RE:") else (
            "FW" if sbj.startswith("FW:") else 'NEW')
        return l1

    udf2 = udf(to_label, StringType())
    df_with_label = df.withColumn('label', udf2("Subject"))
    zz = df_with_label.limit(5).toPandas()
    LOGGER.info(
        "\n\n6.\tIntroduce a new column label to identify new, replied, and forwarded messages\tDone!\n\n{}\n"
        .format(zz))

    # 7 Get the trend of the overall mail activity using a pivot table from Spark itself
    pivotDF = df_with_label.groupBy(
        year("UTC_timestamp").alias('year'),
        month("UTC_timestamp").alias('month')).pivot("label").count().orderBy(
            "year", "month")
    zz = pivotDF.na.fill(0).toPandas()
    LOGGER.info(
        "\n\n7.\tGet the trend of the over mail activity using the pivot table from spark itself\tDone!\n\n{}\n"
        .format(zz))

    # 8 Use k-means clustering to create 4 clusters from the extracted keywords
    raw = result.select("Message_ID", "From", "Subject")

    # Extract words
    tokenizer = Tokenizer().setInputCol("Subject").setOutputCol("words")
    transformed = tokenizer.transform(raw)

    # Remove stopwords (custom stopwords)
    stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
    remover = StopWordsRemover().setStopWords(stopwords).setInputCol(
        "words").setOutputCol("filtered")
    cleaned = remover.transform(transformed)
    cleaned = cleaned.select("Message_ID", "words", "filtered")

    # Generate features
    cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol(
        "features").fit(cleaned)
    featured = cvmodel.transform(cleaned)

    kmeans = KMeans(k=4, seed=1)  # 4 clusters here
    model = kmeans.fit(featured.select('features'))
    transformed = model.transform(featured)
    zz = transformed.limit(5).toPandas()
    LOGGER.info(
        "\n\n8.\tUse k-means clustering to create 4 clusters from the extracted keywords\tDone!\n\n{}\n"
        .format(zz))

    # 9 Use LDA to generate 4 topics from the extracted keywords
    LOGGER.info(
        "\n\n9.\tUse LDA to generate 4 topics from the extracted keywords\tDone!\n\n{}\n{}\n{}\n{}\n"
        .format(get_topic(0, transformed), get_topic(1, transformed),
                get_topic(2, transformed), get_topic(3, transformed)))
def main():
    conf = SparkConf().setAppName("Program Number 1")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    # create a Spark session
    spark = SparkSession.builder.appName("Program Number 1").getOrCreate()

    # tweets folder address on the HDFS server - ignore files with .tmp extensions (Flume active files)
    inputpath = "hdfs://hdfs input path"
    spark.conf.set("spark.sql.shuffle.partitions", 1)

    # get the raw tweets from HDFS
    raw_tweets = spark.read.format("json").option(
        "inferSchema", "true").option("mode", "dropMalformed").load(inputpath)

    # get the tweet text from the raw data: text is transformed to lower case, re-tweets are
    # deleted, and finally an index is added for each tweet
    tweets = raw_tweets.select(
        functions.lower(functions.col("text"))).withColumnRenamed(
            "lower(text)", "text").distinct().withColumn(
                "id", functions.monotonically_increasing_id())

    # Create a tokenizer that filters away tokens with length < 4 and gets rid of symbols like $, #, ...
    tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(
        4).setInputCol("text").setOutputCol("tokens")

    # Tokenize tweets
    tokenized_tweets = tokenizer.transform(tweets)

    remover = StopWordsRemover().setInputCol("tokens").setOutputCol("cleaned")

    # remove stopwords
    cleaned_tweets = remover.transform(tokenized_tweets)

    # create a vector of words that appeared in at least two different tweets, and set the maximum vocab size to 20000
    vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol(
        "features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)
    wordVectors = vectorizer.transform(cleaned_tweets).select("id", "features")

    # LDA
    # create a Latent Dirichlet Allocation model and run it on our data with 25 iterations and 5 topics
    lda = LDA(k=5, maxIter=25)
    # fit the model on data
    ldaModel = lda.fit(wordVectors)
    # create topics based on LDA
    lda_topics = ldaModel.describeTopics()
    # show LDA topics

    # ______________________________________________________________________________________________________________
    # LSA
    clean_tweets_list = []
    tweet_list = []
    # create the document-term matrix used as input for the LSIModel;
    # this is needed as LSI expects tuples of (vocabulary_index, frequency) form
    for tweet_row in wordVectors.select('features').collect():
        tweet_list.clear()
        # read the SparseVector of the 'features' column (hence the 0 index) and zip it into a list
        # idx = vocabulary_index, val = frequency of that word in that tweet
        for idx, val in zip(tweet_row[0].indices, tweet_row[0].values):
            # convert the frequency from float to integer
            tweet_list.append((idx, int(val)))
        clean_tweets_list.append(tweet_list[:])

    # call the LSIModel, passing the number of topics as 5
    lsa_model = LsiModel(clean_tweets_list, num_topics=5)
    # show LSA topics

    # ______________________________________________________________________________________________________________
    # Comparison
    # get the weights and indices of words from LDA topics in the format of List[list[]]
    lda_wordIndices = [row['termIndices'] for row in lda_topics.collect()]
    lda_wordWeights = [row['termWeights'] for row in lda_topics.collect()]

    # get the weights of words from LSA topics as a numpy array with 5*wordCount shape;
    # each element is the weight of the corresponding word in that specific topic
    lsa_weightsMatrix = lsa_model.get_topics()

    # function to calculate the similarity between an LSA topic and an LDA topic
    def topic_similarity_calculator(lsa_t, lda_t):
        (lda_index, lda_weight) = lda_t
        sum = 0
        for index, weight in zip(lda_index, lda_weight):
            sum = sum + (np.abs(lsa_t[index] * weight))
        return sum

    # run the similarity function on 25 possibilities (5 LSA * 5 LDA)
    similarity = []
    eachLSA = []
    for i in range(0, 5):
        eachLSA.clear()
        for j in range(0, 5):
            temp = topic_similarity_calculator(
                lsa_weightsMatrix[i],
                (lda_wordIndices[j], lda_wordWeights[j]))
            eachLSA.append(temp)
        similarity.append(eachLSA[:])

    # Print the similarity table
    # each row is an LDA topic and each column is an LSA topic
    print(" ")
    print("Similarity table")

    def similarity_print(s):
        i = 1
        print("|--------------------------------------------------------|")
        print("| | LSA 1 | LSA 2 | LSA 3 | LSA 4 | LSA 5 |")
        print("|--------------------------------------------------------|")
        for one, two, three, four, five in zip(*s):
            print(
                '|LDA {} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} |'
                .format(i, one, two, three, four, five))
            print("|--------------------------------------------------------|")
            i = i + 1

    # create the similarity matrix
    similarity_print(similarity)

    # ______________________________________________________________________________________________________________
    # Final result table
    # Manually found the following topics to be similar:
    # (LSA1 - LDA1)
    # (LSA5 - LDA2)
    # the rest stand alone
    lsa_words_idx = []
    for idx, curr_topic in enumerate(lsa_weightsMatrix):
        lsa_words_idx.append(np.abs(curr_topic).argsort()[-10:][::-1])

    lsa_topics_bow = {}
    lda_topics_bow = {}
    lsa_bow_list = []
    lda_bow_list = []
    for curr_idx, (lda_topic, lsa_topic) in enumerate(zip(lda_wordIndices, lsa_words_idx)):
        lsa_bow_list.clear()
        lda_bow_list.clear()
        for idx in range(10):
            lsa_bow_list.append(vectorizer.vocabulary[lsa_topic[idx]])
            lda_bow_list.append(vectorizer.vocabulary[lda_topic[idx]])
        lsa_topics_bow[curr_idx] = lsa_bow_list[:]
        lda_topics_bow[curr_idx] = lda_bow_list[:]

    results = []
    names = []

    # Creating the word dictionary for LDA2 and LSA5
    lda2_lsa5 = lda_topics_bow[1][:]
    for word in (lsa_topics_bow[4]):
        if word not in lda2_lsa5:
            lda2_lsa5.append(word)

    # Creating the word dictionary for LDA1 and LSA1
    lda1_lsa1 = lda_topics_bow[0][:]
    for word in (lsa_topics_bow[0]):
        if word not in lda1_lsa1:
            lda1_lsa1.append(word)

    results.append(lda1_lsa1)
    names.append("LDA1 - LSA1 ")
    results.append(lda2_lsa5)
    names.append("LDA2 - LSA5 ")
    results.append(lda_topics_bow[2])
    names.append("LDA3 ")
    results.append(lda_topics_bow[3])
    names.append("LDA4 ")
    results.append(lda_topics_bow[4])
    names.append("LDA5 ")
    results.append(lsa_topics_bow[1])
    names.append("LSA2 ")
    results.append(lsa_topics_bow[2])
    names.append("LSA3 ")
    results.append(lsa_topics_bow[3])
    names.append("LSA4 ")

    # print the topics and related words
    print(" ")
    print("Topics Table")
    print("|------------------------------------------------------------------------------------------|")
    print("| Topic | Significant Words |")
    print("|------------------------------------------------------------------------------------------|")
    for name, r in zip(names, results):
        print('| {} | {} |'.format(name, r))
        print("|------------------------------------------------------------------------------------------|")
    print(" ")
    print(" ")
print(input_data.take(4))

prepared_data = input_data.map(lambda x: (get_patent_name(x[1]), get_claims(x[1]))) \
    .map(lambda x: (x[0], remove_punctuation(x[1]))) \
    .map(lambda x: (x[0], remove_linebreaks(x[1])))
prepared_df = prepared_data.toDF().selectExpr('_1 as patent_name',
                                              '_2 as patent_claims')

# Split the claims into tokens
tokenizer = Tokenizer(inputCol="patent_claims", outputCol="words")
words_data = tokenizer.transform(prepared_df)

# Filter the tokens, keeping only words
filtered_words_data = words_data.rdd.map(lambda x: (x[0], x[1], get_only_words(x[2])))
filtered_df = filtered_words_data.toDF().selectExpr('_1 as patent_name',
                                                    '_2 as patent_claims',
                                                    '_3 as words')

# Remove stop words (conjunctions, prepositions, pronouns, etc.)
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
filtered = remover.transform(filtered_df)

vectorizer = CountVectorizer(inputCol='filtered',
                             outputCol='raw_features').fit(filtered)
featurized_data = vectorizer.transform(filtered)
featurized_data.cache()

idf = IDF(inputCol='raw_features', outputCol='features')
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

# Show the rescaled_data table
rescaled_data.show()

spark.stop()
def makeWord2VecModel():
    cursor = News.find({})
    text = ""
    for news in cursor:
        text += news['text']
    with open(os.path.join(os.getcwd(), 'word2Vec.txt'), 'w',
              encoding='utf-8') as inputFile:
        inputFile.writelines(text)

    spark = SparkSession.builder.appName("SimpleApplication").getOrCreate()

    # Load the file into an RDD line by line
    input_file = spark.sparkContext.textFile('word2Vec.txt')
    print(input_file.collect())
    prepared = input_file.map(lambda x: ([x]))
    df = prepared.toDF()
    prepared_df = df.selectExpr('_1 as text')

    # Split into tokens
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    # Remove stop words
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(words)

    # Print the stop words for the Russian language
    print(stop_words)
    # Show the filtered table
    filtered.show()
    # Show the 'words' column with tokens before stop-word removal
    words.select('words').show(truncate=False, vertical=True)
    # Show the 'filtered' column with tokens after stop-word removal
    filtered.select('filtered').show(truncate=False, vertical=True)

    # Compute the TF values
    vectorizer = CountVectorizer(inputCol='filtered',
                                 outputCol='raw_features').fit(filtered)
    featurized_data = vectorizer.transform(filtered)
    featurized_data.cache()
    vocabulary = vectorizer.vocabulary

    # Show the table with term frequency values
    featurized_data.show()
    # Show the 'raw_features' column of the featurized_data table
    featurized_data.select('raw_features').show(truncate=False, vertical=True)
    # Print the terms in the vocabulary
    print(vocabulary)

    # Compute the IDF values
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Show the rescaled_data table
    rescaled_data.show()
    # Show the 'features' column of the rescaled_data table
    rescaled_data.select('features').show(truncate=False, vertical=True)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol='words',
                        outputCol='result')
    model = word2Vec.fit(words)
    w2v_df = model.transform(words)
    w2v_df.show()

    persons = []
    cPersons = db.Persones.find({})
    for secName in cPersons:
        persons.append(secName['sName'])

    # findSynonyms returns a DataFrame of (word, similarity) rows
    synonyms = model.findSynonyms('погибла', 2)
    for word, cosine_distance in synonyms.collect():
        print(str(word))

    spark.stop()
df = df.repartition(20)

# # Use nltk.word_tokenize to tokenize words
# @udf(ArrayType(StringType()))
# def tokenize(string):
#     return word_tokenize(string)
# df = df.withColumn("words", tokenize("reviewText"))
df = RegexTokenizer(inputCol="reviewText", outputCol="words",
                    pattern="\\W").transform(df)
df = df.drop("reviewText")

cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df)
vocabulary = cv_model.vocabulary
df = cv_model.transform(df)
df = df.drop("words")
df.cache()

df = IDF(inputCol="tf", outputCol="tfidf").fit(df).transform(df)
df = df.drop("tf")
df.unpersist()


@udf(MapType(StringType(), FloatType()))
def create_map(vector):
    # map each non-zero index in the tf-idf vector back to its vocabulary term
    zipped = zip(vector.indices, vector.values)
    return dict((vocabulary[int(x)], float(y)) for (x, y) in zipped)


results = df.withColumn("tfidf", create_map("tfidf"))
results.write.json("hdfs:/output/tfidf", mode="overwrite")
class TFIDF:
    def __init__(self):
        self.sparkSession = SparkSession\
            .builder\
            .appName("DocumentSearchEngine")\
            .getOrCreate()
        self.sc = self.sparkSession.sparkContext
        self.tf = None

        # create vocabulary
        vocab_rdd = self.sparkSession.sparkContext.wholeTextFiles(
            "data/word_list.txt")
        wordlist = self.sparkSession.createDataFrame(vocab_rdd, ["text", "data"])
        tokenizer = Tokenizer(inputCol="data", outputCol="words")
        wordsData = tokenizer.transform(wordlist)
        self.vocabModel = CountVectorizer(inputCol="words",
                                          outputCol="rawFeatures").fit(wordsData)
        self.word_to_id = dict()
        for id, word in enumerate(self.vocabModel.vocabulary):
            self.word_to_id[word] = id
        self.id_to_path = dict()

        # destructor
        atexit.register(self.cleanup)

    def get_vocabulary(self):
        return self.vocabModel.vocabulary

    def get_tf_idf(self, stage_folder):
        """
        :param stage_folder: folder whose documents should be scored
        :return: rows of document path and tf-idf features
        """
        print(stage_folder)
        documents_rdd = self.sparkSession.sparkContext.wholeTextFiles(
            stage_folder + "/*")
        # state.logger.debug("documents : %s", [each for each in documents_rdd.collect()])
        documents = self.sparkSession.createDataFrame(documents_rdd,
                                                      ["path", "text"])
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        wordsData = tokenizer.transform(documents)
        # wordsData.select("path", "words").show(20, False)
        current_tf = self.vocabModel.transform(wordsData)
        # current_tf.show(20, False)
        if self.tf is not None:
            self.tf = self.tf.union(current_tf)
        else:
            self.tf = current_tf
        # self.tf.show(20, False)
        idf = IDF(inputCol="rawFeatures", outputCol="tfidf")
        idfModel = idf.fit(self.tf)
        self.tfidf = idfModel.transform(self.tf)
        # rescaledData.select("path", "features")
        ans = []
        state.logger.debug(
            "TFIDF : %s",
            [each for each in self.tfidf.select("path", "tfidf").collect()])
        for each in self.tfidf.select("path", "tfidf").collect():
            ans.append(each)
        return ans

    def get_tf_idf_map(self, tfidf):
        # create mapping dicts
        tf_idf_map = dict()
        file_to_id_map = dict()
        id_to_file_map = dict()
        index = 0
        for row in tfidf:
            doc = row.path
            if doc not in file_to_id_map:
                id_to_file_map[index] = doc
                file_to_id_map[doc] = index
                index += 1
            for word, score in zip(row.tfidf.indices, row.tfidf.values):
                if score > 0:
                    if word not in tf_idf_map:
                        tf_idf_map[word] = []
                    tf_idf_map[word].append((file_to_id_map[doc], score))
        for key in tf_idf_map:
            tf_idf_map[key].sort(key=lambda x: -x[1])
        return tf_idf_map, file_to_id_map, id_to_file_map, self.vocabModel.vocabulary, self.word_to_id

    def cleanup(self):
        self.sparkSession.stop()
sentenceData = spark.createDataFrame(all_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

print("original words:")
for words_label in wordsData.select("words", "label").take(printLines):
    print(words_label)

''' calculate term frequency '''
# hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
# featurizedData = hashingTF.transform(wordsData)
cv = CountVectorizer(inputCol="words",
                     outputCol="rawFeatures",
                     vocabSize=vocabNumber,
                     minDF=minDFValue).fit(wordsData)
featurizedData = cv.transform(wordsData)

print("words after TF:")
print(cv.vocabulary)
for words_label in featurizedData.select("rawFeatures", "words").take(printLines):
    print(words_label)

''' calculate IDF '''
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
dataset = idfModel.transform(featurizedData)

print("words after IDF:")
for features_label in dataset.select("features", "label").take(printLines):
    print(features_label)
def train_model_sentences_with_person():
    sentences_with_person_collection = get_db_collection_object('SentencesWithPerson')
    with open("sentences_with_person.txt", "w",
              encoding='utf-8') as file_sentences_with_person:
        for sen in sentences_with_person_collection.find():
            file_sentences_with_person.write('{0}\n'.format(sen['sentence']))

    spark = SparkSession \
        .builder \
        .appName("SentenceProcessor") \
        .getOrCreate()

    input_data = spark.sparkContext.textFile('./sentences_with_person.txt')
    prepared_data = input_data.map(lambda x: (x, len(x)))
    prepared_data = prepared_data.filter(lambda x: x[1] > 0)
    prepared_df = prepared_data.toDF().selectExpr('_1 as sentence', '_2 as length')
    # prepared_df.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    words_data = tokenizer.transform(prepared_df)
    # words_data.show(truncate=False)

    # Filter the tokens, keeping only words
    filtered_words_data = words_data.rdd.map(
        lambda x: (x[0], x[1], get_only_words(x[2])))
    filtered_df = filtered_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words')
    # filtered_df.show()

    # Remove stop words (conjunctions, prepositions, pronouns, etc.)
    stop_words = stopwords.words('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(filtered_df)

    normalize_words_data = filtered.rdd.map(
        lambda x: (x[0], x[1], x[2], normalization_sentence(x[3])))
    normalized_df = normalize_words_data.toDF().selectExpr(
        '_1 as sentence', '_2 as length', '_3 as words', '_4 as normalize_words')
    # normalized_df.show()

    vectorizer = CountVectorizer(inputCol='normalize_words',
                                 outputCol='raw_features').fit(normalized_df)
    featurized_data = vectorizer.transform(normalized_df)
    featurized_data.cache()

    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=300,
                        minCount=0,
                        inputCol='normalize_words',
                        outputCol='result')
    doc2vec_pipeline = Pipeline(stages=[tokenizer, word2Vec])
    model = word2Vec.fit(rescaled_data)
    w2v_df = model.transform(rescaled_data)
    # w2v_df.show(truncate=False)
    # print(model.findSynonyms('бочаров', 2).show())

    # sc = spark.sparkContext
    path = './models/model_person'
    # print(sc, path)
    model.write().overwrite().save(path)
    # m = Word2Vec.load('./models/model_person/')
    # pickle.dump(model, './models/model_person/mp.model')

    spark.stop()
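# A minimal sketch of reloading the model saved above. Note that a fitted model is
# reloaded via Word2VecModel.load(), not Word2Vec.load() (which loads the unfitted
# estimator); the path is the one used in train_model_sentences_with_person().
from pyspark.ml.feature import Word2VecModel

reloaded_model = Word2VecModel.load('./models/model_person')
# reloaded_model.findSynonyms(<some word>, 2).show()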
testing_set_raw = df_filtered.filter(
    df_filtered.id >= training_end_idx).repartition(partitions_no)

# 2. Create vocabulary
log.warn("Building vocabulary")
cv_model = CountVectorizer(inputCol="filtered",
                           outputCol="vectors",
                           minDF=minDF,
                           vocabSize=vocab_size).fit(training_set_raw)
V = len(cv_model.vocabulary)
log.warn("Vocabulary size = {0}".format(V))

# 3. Transform documents to BOW representation:
#    each doc is represented as a SparseVector: (vocabSize, {word_id: count, word_id: count, ...})
log.warn("Transform training dataset to bow representation")
training_set = cv_model.transform(training_set_raw).select('id', 'vectors').cache()
log.warn('Training set: {0} documents'.format(training_set.count()))
training_set_local = training_set.collect()

# 4. Initialize model:
#    4.1 each doc is represented by (id, z_n array (topic-to-word assignment)) and c_k_m (topic distribution for the doc)
#    4.2 randomly assign a topic to each word in the document, incrementing c_k_m accordingly
z_m_n = training_set.rdd.map(init, preservesPartitioning=True).cache()
z_m_n_matrix = z_m_n.flatMap(word_topics).reduceByKey(lambda a, b: a + b).collect()
c_k_global = update_c_k(z_m_n_matrix)
c_k_n_global = update_c_k_n(z_m_n_matrix)
c_k_m_x = get_c_k_m_x(
    z_m_n.map(lambda x: (x[0], x[2])).sortByKey(ascending=True).collect())
# ### Sample of 2-word nGrams on Maintenance Notes
tk = Tokenizer(inputCol="note", outputCol="words")              # Tokenize
maintTokenized = tk.transform(maintenance)

swr = StopWordsRemover(inputCol="words", outputCol="filtered")  # Remove stop-words
maintFiltered = swr.transform(maintTokenized)

ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")     # 2-word nGrams
maintNGrams = ngram.transform(maintFiltered)

maintNGrams.select('ngrams').show(5, truncate=False)

# ### Topic Clustering using Latent Dirichlet Allocation (LDA)
# LDA is a form of un-supervised machine learning that identifies clusters, or topics,
# in the data

# CountVectorizer converts the nGram array into a vector of counts
cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=50)\
    .fit(maintNGrams)
maintVectors = cv.transform(maintNGrams)
vocabArray = cv.vocabulary

lda = LDA(k=3, maxIter=10)
ldaModel = lda.fit(maintVectors)
ldaModel.write().overwrite().save('lda.mdl')

topics = ldaModel.describeTopics(5)

# We see below that each maintenance log can be clustered based on its text into
# 1 of the 3 topics below. The nGrams in each cluster clearly show 3 types of maintenance
# activities:
# 1. Preventive maintenance occurs when we have 'abnormal readings' or a 'component replacement'
# 2. Corrective maintenance occurs when we have an 'asset shutdown' event or 'asset failure'
# 3. The rest of the logs indicate that no downtime is required (i.e. 'maintenance tests passed', 'asset healthy')
for topic in topics.collect():
    print('Topic %d Top 5 Weighted nGrams' % (topic[0] + 1))
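# A minimal sketch (an assumption, not part of the original notebook) showing how the
# loop above could also map each topic's termIndices back to the CountVectorizer
# vocabulary, so the weighted nGrams are printed alongside the topic header:
for topic in topics.collect():
    print('Topic %d Top 5 Weighted nGrams' % (topic[0] + 1))
    for termIndex, termWeight in zip(topic['termIndices'], topic['termWeights']):
        print('  %s (%.4f)' % (vocabArray[termIndex], termWeight))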
def main(train_x, train_y, test_x, test_y=None, base='gs'):
    # generate joint feature set
    train_features = elizabeth.preprocess.load(train_x, train_y, base=base,
                                               kind='joint').drop('url')
    test_features = elizabeth.preprocess.load(test_x, test_y, base=base,
                                              kind='joint').drop('url')
    train_features.show()

    token_counter = CountVectorizer(inputCol='features',
                                    outputCol='tokenCounts',
                                    minDF=10).fit(train_features)
    train = token_counter.transform(train_features).drop('features')
    test = token_counter.transform(test_features).drop('features')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # create and train a Random Forest classifier
    rf = RandomForestClassifier(labelCol='indexedLabel',
                                featuresCol='tokenCounts',
                                numTrees=20,
                                maxDepth=10,
                                minInfoGain=0.0,
                                seed=12345)
    model = rf.fit(train)
    prediction = model.transform(test)
    prediction = index_labeller.transform(prediction)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                      predictionCol='prediction',
                                                      metricName='accuracy')
        accuracy = evaluator.evaluate(prediction)
        print("\n\tAccuracy on test set: %0.6f\n" % accuracy)

    # If no labels are given for the test set, print predictions.
    else:
        prediction = prediction.orderBy(prediction.id).select(prediction.predictedClass)
        prediction = prediction.rdd.map(lambda prediction: int(prediction.predictedClass))
        prediction = prediction.toLocalIterator()
        print(*prediction, sep='\n')
class BM25Model(object):
    """
    Computes the BM25 score.
    """

    def __init__(self, k=1.2, b=.75):
        self.k = k
        self.b = b
        self.tok = Tokenizer(inputCol='__input', outputCol='__tokens')
        self.vec = CountVectorizer(inputCol='__tokens', outputCol='__counts')
        self.idf = IDF(inputCol='__counts', outputCol='__idf')
        self.train_col = None
        self.udf = None
        self.is_fit = False

    def fit(self, df, train_col):
        """
        Fits the model on the input df.

        df: a pyspark dataframe.
        train_col (string): the name of the column containing training documents.

        Returns: self
        """
        self.train_col = train_col
        df_ = self.tok.transform(df.withColumnRenamed(train_col, '__input'))
        mean_dl = df_.select(F.mean(F.size(F.col('__tokens')))).collect()[0][0]
        self.vec = self.vec.fit(df_)
        df_ = self.vec.transform(df_)
        self.idf = self.idf.fit(df_)
        # this will reset the value of self.udf to be a working udf function.
        exec(udf_template.format(mean_dl, self.k, self.b))
        self.is_fit = True
        return self

    def transform(self,
                  df,
                  score_col,
                  bm25_output_name='bm25',
                  tf_output_name=None,
                  ntf_output_name=None,
                  tfidf_output_name=None):
        """
        Computes the BM25 score, along with normalized term frequency (ntf) and tfidf.
        These three additional scores come "for free" with bm25 but are only returned
        optionally.
        """
        if not self.is_fit:
            raise Exception(
                "You must fit the BM25 model with a call to .fit() first.")
        columns = df.columns
        df_ = self.tok.transform(df.withColumnRenamed(score_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = self.idf.transform(df_)
        df_ = (df_.withColumnRenamed('__counts', '__query_counts')
                  .withColumnRenamed('__input', score_col)
               ).select(columns + [score_col, '__query_counts', '__idf'])
        df_ = self.tok.transform(df_.withColumnRenamed(self.train_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = df_.withColumnRenamed('__counts', '__item_counts')
        df_ = df_.withColumn(
            'bm25',
            self.udf(F.col('__query_counts'), F.col('__item_counts'), F.col('__idf')))
        df_ = df_.withColumnRenamed('__input', self.train_col)
        computed_values = df_.withColumn(
            'more', F.explode(F.array(F.col('bm25')))).select(columns + ['bm25.*'])

        # this is the logic for naming the output column(s)
        final_selection = columns
        if bm25_output_name is not None:
            computed_values = computed_values.withColumnRenamed('bm25', bm25_output_name)
            final_selection.append(bm25_output_name)
        if tf_output_name is not None:
            computed_values = computed_values.withColumnRenamed('tf', tf_output_name)
            final_selection.append(tf_output_name)
        if ntf_output_name is not None:
            computed_values = computed_values.withColumnRenamed('ntf', ntf_output_name)
            final_selection.append(ntf_output_name)
        if tfidf_output_name is not None:
            computed_values = computed_values.withColumnRenamed('tfidf', tfidf_output_name)
            final_selection.append(tfidf_output_name)
        return computed_values.select(final_selection)
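# Hypothetical usage sketch for BM25Model; the DataFrame and its 'document'/'query'
# column names are assumptions, and udf_template must be defined in the same module
# for fit() to work. The frame passed to transform() must contain both the fitted
# train_col and the score_col.
bm25 = BM25Model(k=1.2, b=0.75).fit(pairs_df, train_col='document')
scored_df = bm25.transform(pairs_df, score_col='query', bm25_output_name='bm25')
scored_df.show(truncate=False)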
def compute(sc, topLeft, bottomRight, step, datasetPath, k, gfs):
    sqlContext = SQLContext(sc)
    data = sc.textFile(datasetPath)
    data = data.mapPartitions(lambda x: csv.reader(x))
    header = data.first()
    data = data.filter(lambda x: x != header)

    result_to_write = []
    res_computation = []
    step = check_step(topLeft, bottomRight, step)
    squares = get_squares(topLeft, bottomRight, step)

    # start computing elapsed time here
    start_time = time.time()

    data = data.map(lambda x: is_inside(x, topLeft, bottomRight, step, squares)). \
        filter(lambda x: x is not None)
    data = data.map(remove_punctuation). \
        map(split_string_into_array). \
        filter(remove_empty_array). \
        map(create_row). \
        groupByKey(). \
        map(lambda x: (x[0], list(x[1])))

    # create the dataframes
    allDf = []
    for df in data.collect():
        if df:
            allDf.append([df[0], sqlContext.createDataFrame(df[1])])

    for docDFs in allDf:
        docDF = docDFs[1]
        squareId = docDFs[0]

        StopWordsRemover.loadDefaultStopWords('english')
        newDocDF_eng = StopWordsRemover(inputCol="words", outputCol="filtered_eng"). \
            transform(docDF)
        newDocDF_eng = newDocDF_eng.drop('words')

        StopWordsRemover.loadDefaultStopWords('italian')
        newDocDF_ita = StopWordsRemover(inputCol="filtered_eng", outputCol="filtered_ita"). \
            transform(newDocDF_eng)
        newDocDF_ita = newDocDF_ita.drop('filtered_eng')

        StopWordsRemover.loadDefaultStopWords('german')
        newDocDF_ger = StopWordsRemover(inputCol="filtered_ita", outputCol="filtered_ger"). \
            transform(newDocDF_ita)
        newDocDF_ger = newDocDF_ger.drop('filtered_ita')

        model = CountVectorizer(inputCol="filtered_ger", outputCol="vectors"). \
            fit(newDocDF_ger)
        result = model.transform(newDocDF_ger)
        corpus = result.select("idd", "vectors").rdd.map(create_corpus).cache()

        # cluster the documents into the k topics using LDA
        ldaModel = LDA.train(corpus, k=k, maxIterations=100, optimizer='online')
        vocabArray = model.vocabulary
        wordNumbers = 10  # number of words per topic
        topicIndices = sc.parallelize(
            ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
        toBePrinted = min(len(vocabArray), wordNumbers)
        topics_final = topicIndices.map(
            lambda x: topic_render(x, toBePrinted, vocabArray)).collect()

        # compute labels
        topics_label = []
        for topic in topics_final:
            for topic_term in topic:
                if topic_term not in topics_label:
                    topics_label.append(topic_term)
                    break

        # print topics
        s = "; "
        res = "{}, {}, {}, {}, {}".format(topLeft.x, topLeft.y, bottomRight.x,
                                          bottomRight.y, s.join(topics_label))
        result_to_write.append(res)
        res_computation.append(topics_label)

    end_time = time.time()
    elapsed_time = end_time - start_time
    result_to_write.append(elapsed_time)
    to_write = sc.parallelize(result_to_write)

    # get dataset size from file name
    size = datasetPath.split('.')[0].split('_')[1]
    if gfs:
        output_folder = "/tmp/Topic_Zoomer_" + str(
            time.ctime(start_time)).replace(' ', '_').replace(':', '-') + '_' + size
    else:
        output_folder = "Topic_Zoomer_" + str(time.ctime(start_time)).replace(
            ' ', '_').replace(':', '-') + '_' + size
    to_write.saveAsTextFile(output_folder)

    if gfs:
        copyHdfsCmd = 'hdfs dfs -copyToLocal {} {}'.format(output_folder, output_folder)
        copyBucketCmd = 'gsutil cp -r {} {}'.format(output_folder, gfs_output_path_hdfs)
        copyRecBucketCmd = 'gsutil cp -r {} {}'.format(recFileFolder, gfs_output_path_hdfs)
        copyHdfsRes = subprocess.call(shlex.split(copyHdfsCmd))
        copyBucketRes = subprocess.call(shlex.split(copyBucketCmd))
        copyRecBucketRes = subprocess.call(shlex.split(copyRecBucketCmd))
        # some exit code checks
        if copyBucketRes or copyHdfsRes or copyRecBucketRes:
            print('hdfsRes: {}'.format(copyHdfsRes))
            print('bucketResComp: {}'.format(copyBucketRes))
            print('bucketResRec: {}'.format(copyRecBucketRes))
            print('Something went wrong while copying results')

    return res_computation