from pyspark.ml.feature import IDF
from pyspark.sql import DataFrame


def tfidf_vectors(input_col: str, output_col: str, df: DataFrame, num_features=262144):
    """Calculate the TF-IDF vectors for the given term-frequency vectors."""
    # The inverse document frequency can be calculated using
    # only the term frequencies; essentially it is a column-wise
    # operation over every term in the corpus.
    idf = IDF(minDocFreq=0, inputCol=input_col, outputCol=output_col).fit(df)
    return idf.transform(df)

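# A minimal usage sketch (not from the original source): the column fed to
# tfidf_vectors must already hold term-frequency vectors, e.g. produced by
# HashingTF; "tokens_df" and the column names here are illustrative assumptions.
from pyspark.ml.feature import HashingTF

hashing_tf = HashingTF(inputCol="words", outputCol="tf", numFeatures=262144)
tf_df = hashing_tf.transform(tokens_df)          # tokens_df: DataFrame with a "words" array column
tfidf_df = tfidf_vectors("tf", "tfidf", tf_df)   # returns tf_df with an added "tfidf" column
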
def get_product_similarity(self):
    """Calculate the similarity between products based on their taxonomy."""
    # __data_manipulation returns one row per product with an L2-normalized
    # TF-IDF vector of its taxonomy terms in the "norm" column, so the
    # HashingTF/IDF/Normalizer pipeline does not need to be repeated here.
    norma_data = self.__data_manipulation(self.productCol)

    col1 = "i." + self.productCol
    col2 = "j." + self.productCol
    # the dot product of two L2-normalized vectors is their cosine similarity
    dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
    result = norma_data.alias("i").crossJoin(norma_data.alias("j")) \
        .select(
            col(col1).alias("i"),
            col(col2).alias("j"),
            dot_udf("i.norm", "j.norm").alias("dot")) \
        .sort("i", "j")
    # keep each unordered pair once and only reasonably similar pairs;
    # the parentheses are required because & binds tighter than < and >
    result = result.filter((result.i < result.j) & (result.dot > 0.5))
    return result

def __data_manipulation(self, id_col):
    """Build one L2-normalized TF-IDF vector per value of id_col from the taxonomy column."""
    data = self.data.select(id_col, self.taxonomyCol).distinct()
    data = data.withColumn(self.taxonomyCol,
                           data[self.taxonomyCol].cast(StringType()))
    # collect all taxonomy entries per id into one comma-separated string ...
    concat_list = udf(lambda lst: ", ".join(lst), StringType())
    data = data.groupby(id_col).agg(
        collect_list(self.taxonomyCol).alias(self.taxonomyCol))
    data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol))
    # ... then strip spaces and split back into an array of terms
    data = data.withColumn(
        self.taxonomyCol,
        split(regexp_replace(self.taxonomyCol, " ", ""), ','))

    hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
    tf = hashingTF.transform(data)
    idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
    tfidf = idf.transform(tf)
    normalizer = Normalizer(inputCol="feature", outputCol="norm")
    norma_data = normalizer.transform(tfidf)
    return norma_data

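# A minimal, self-contained sketch (not part of the original class) illustrating
# why the dot product used in get_product_similarity equals cosine similarity:
# after Normalizer(p=2) the vectors have unit length, so dot(x, y) == cos(x, y).
import numpy as np

x = np.array([1.0, 2.0, 0.0])
y = np.array([0.0, 2.0, 1.0])
x_n = x / np.linalg.norm(x)   # what Normalizer(p=2.0) produces
y_n = y / np.linalg.norm(y)
cosine = float(x_n.dot(y_n))  # 0.8, identical to the "dot" column above
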
def tf_idf(data_rdd):
    """
    Calculate term frequency-inverse document frequency to reflect the
    importance of words in a Tweet.

    :param data_rdd: input data RDD
    :return: transformed dataframe
    """
    data_rdd_df = data_rdd.toDF()
    hashing_tf = HashingTF(inputCol="words", outputCol="tf_features")
    tf_data = hashing_tf.transform(data_rdd_df)
    idf_data = IDF(inputCol="tf_features", outputCol="features").fit(tf_data)
    tf_idf_data = idf_data.transform(tf_data)
    return tf_idf_data.select(["label", "words", "features"])

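# A minimal usage sketch (illustrative, not from the original script): tf_idf
# expects an RDD of rows that already carry a "label" and a tokenized "words"
# column; the sample data below is an assumption for demonstration only.
from pyspark.sql import Row

sample_rdd = sc.parallelize([
    Row(label=1.0, words=["great", "movie", "loved", "it"]),
    Row(label=0.0, words=["terrible", "waste", "of", "time"]),
])
features_df = tf_idf(sample_rdd)
features_df.show(truncate=False)
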
# Performing tokenization on the data
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
wordsData.show()

"""# **2.a) Performing the task without NLP**"""

# applying TF on the tokenized data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
tf = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

# calculating the IDF
tf.cache()
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf = idf.fit(tf)
tfidf = idf.transform(tf)

# displaying the results
tfidf.select("label", "features").show()
print("TF-IDF without NLP:")
for each in tfidf.collect():
    print(each)
    print(each['rawFeatures'])

spark.stop()

"""# **2.b) Performing the task with Lemmatization**"""

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

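# A minimal sketch (an assumption, not the original notebook's code) of how
# section 2.b might apply the lemmatizer: wrap WordNetLemmatizer in a UDF that
# maps each token array to its lemmas, then feed the "lemmas" column into the
# same HashingTF/IDF pipeline as above. This assumes a new SparkSession has
# been created and the documents re-tokenized into wordsData, since
# spark.stop() was called earlier.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

lemmatize_udf = udf(lambda words: [lemmatizer.lemmatize(w) for w in words],
                    ArrayType(StringType()))
lemmaData = wordsData.withColumn("lemmas", lemmatize_udf("words"))
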
text_train = sc.textFile(input_file_train)
pure_text_train = text_train.filter(deleteFirstRow)
genre_and_sentences_after_flatmap = pure_text_train.flatMap(extractGenreAndSentencesForFlatmap)
genre_and_sentences_after_flatmap.persist()

# TFIDF
tfidf_dataFrame = genre_and_sentences_after_flatmap.toDF(["genre", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tfidf_words_data = tokenizer.transform(tfidf_dataFrame)
hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=512)
tfidf_featurized_data = hashing_tf.transform(tfidf_words_data)
idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(tfidf_featurized_data)
tfidf_rescaled_data = idf_model.transform(tfidf_featurized_data)
tfidf_genre_features = tfidf_rescaled_data.select("genre", "features")

# Confusion matrix for TFIDF
tfidf_kmeansmodel = KMeans().setK(5).setFeaturesCol('features').setPredictionCol('prediction').fit(tfidf_genre_features)
tfidf_predictions = tfidf_kmeansmodel.transform(tfidf_genre_features).select("prediction", "genre")
tfidf_res = tfidf_predictions.groupBy(['prediction', 'genre']).count().collect()
print("Confusion matrix for TFIDF:")
toPrint(tfidf_res)
print()

#######################################################################
##                Vocabulary Exploration - Part B                    ##
#######################################################################

# pretrained

input_rdd = sc.textFile(train_path).map(split_train)
train_hive_info = hiveCtx.createDataFrame(input_rdd, ['label', 'text'])
split = Tokenizer(inputCol="text", outputCol="words")
wordsData = split.transform(train_hive_info)
my_print('Tokenization finished.......')

# add the TF feature column
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2**10)
TF_data = hashingTF.transform(wordsData)
my_print('TF features built.......')

# add the IDF feature column
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(TF_data)
final_input_data = idf.transform(TF_data)
my_print('IDF features built.......')

# tuple-unpacking lambdas are Python 2 only, so unpack the Row explicitly
train_rdd = final_input_data.select("label", "features") \
    .rdd.map(lambda row: LabeledPoint(row.label, row.features.toArray()))

if model_name == 'LogisticRegression':
    model = LogisticRegressionWithLBFGS.train(train_rdd, numClasses=10)
    model.save(sc, model_path)
elif model_name == 'NaiveBayes':
    model = NaiveBayes.train(train_rdd)
    model.save(sc, model_path)
else:
    model = RandomForest.trainClassifier(train_rdd, 10, {}, 10, seed=42)

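# A minimal sketch (not from the original script) of how a saved model might be
# loaded back for inference; test_rdd (an RDD of LabeledPoint) is an assumption.
from pyspark.mllib.classification import LogisticRegressionModel

if model_name == 'LogisticRegression':
    loaded = LogisticRegressionModel.load(sc, model_path)
    predictions = loaded.predict(test_rdd.map(lambda p: p.features))
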
class BM25Model(object):
    """ Computes BM25 score. """

    def __init__(self, k=1.2, b=.75):
        self.k = k
        self.b = b
        self.tok = Tokenizer(inputCol='__input', outputCol='__tokens')
        self.vec = CountVectorizer(inputCol='__tokens', outputCol='__counts')
        self.idf = IDF(inputCol='__counts', outputCol='__idf')
        self.train_col = None
        self.udf = None
        self.is_fit = False

    def fit(self, df, train_col):
        """
        Fits the model on the input df.

        df: a pyspark dataframe.
        train_col (string): The name of the column containing training documents.

        Returns: self, the fitted BM25Model.
        """
        self.train_col = train_col
        df_ = self.tok.transform(df.withColumnRenamed(train_col, '__input'))
        mean_dl = df_.select(F.mean(F.size(F.col('__tokens')))).collect()[0][0]
        self.vec = self.vec.fit(df_)
        df_ = self.vec.transform(df_)
        self.idf = self.idf.fit(df_)
        # this will reset the value of self.udf to be a working udf function.
        exec(udf_template.format(mean_dl, self.k, self.b))
        self.is_fit = True
        return self

    def transform(self, df, score_col, bm25_output_name='bm25',
                  tf_output_name=None, ntf_output_name=None,
                  tfidf_output_name=None):
        """
        Computes the BM25 score, along with normalized term frequency (ntf) and
        tfidf. These additional scores come "for free" with bm25 but are only
        returned optionally.
        """
        if not self.is_fit:
            raise Exception(
                "You must fit the BM25 model with a call to .fit() first.")
        columns = df.columns
        # vectorize the query column and attach IDF weights
        df_ = self.tok.transform(df.withColumnRenamed(score_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = self.idf.transform(df_)
        df_ = (df_.withColumnRenamed(
            '__counts', '__query_counts').withColumnRenamed(
                '__input', score_col)).select(
                    columns + [score_col, '__query_counts', '__idf'])
        # vectorize the training (item) column with the same fitted stages
        df_ = self.tok.transform(
            df_.withColumnRenamed(self.train_col, '__input'))
        df_ = self.vec.transform(df_)
        df_ = df_.withColumnRenamed('__counts', '__item_counts')
        df_ = df_.withColumn(
            'bm25',
            self.udf(F.col('__query_counts'), F.col('__item_counts'),
                     F.col('__idf')))
        df_ = df_.withColumnRenamed('__input', self.train_col)
        computed_values = df_.withColumn(
            'more',
            F.explode(F.array(F.col('bm25')))).select(columns + ['bm25.*'])
        # this is the logic for naming the output column(s)
        final_selection = columns
        if bm25_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'bm25', bm25_output_name)
            final_selection.append(bm25_output_name)
        if tf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'tf', tf_output_name)
            final_selection.append(tf_output_name)
        if ntf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'ntf', ntf_output_name)
            final_selection.append(ntf_output_name)
        if tfidf_output_name is not None:
            computed_values = computed_values.withColumnRenamed(
                'tfidf', tfidf_output_name)
            final_selection.append(tfidf_output_name)
        return computed_values.select(final_selection)

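# A hedged usage sketch (the dataframes and the "item_text"/"query_text" column
# names are assumptions, not from the original code): fit BM25 on the document
# column, then score rows of a dataframe that carries both columns.
bm25 = BM25Model(k=1.2, b=0.75).fit(items_df, train_col="item_text")
scored = bm25.transform(
    pairs_df,                      # must contain both "item_text" and "query_text"
    score_col="query_text",
    bm25_output_name="bm25_score")
scored.select("item_text", "query_text", "bm25_score").show()
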
## creating the Spark context and dataframe
sc = SparkContext("local", "app")
sqc = SQLContext(sc)
df = sqc.createDataFrame(data, ['type', 'text'])

# NEW VARIABLE GENERATION
# map over the underlying RDD (DataFrame.map was removed in Spark 2.x)
dataCleaned = df.rdd.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text'])))
dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1]))
dfClean = sqc.createDataFrame(dataClean, ['label', 'words'])
dfClean.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000)
tf = hashingTF.transform(dfClean)
idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf)
dfFinal = idf.transform(tf)

# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal)

# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])

# Train the model.
# rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
nb = NaiveBayes(smoothing=1.0, labelCol="indexedLabel", featuresCol="indexedFeatures")

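# A minimal continuation sketch (an assumption, not the original script): chain
# the fitted indexers with the NaiveBayes estimator in a Pipeline, train on the
# training split, and evaluate accuracy on the held-out split.
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

pipeline = Pipeline(stages=[labelIndexer, featureIndexer, nb])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))
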
# df0.show()
print('The number of jobs:', df0.count())
print('\nThe distinct job names: ', df1.job.unique())
print('\nThere are', len(df1.job.unique()) - 1, 'different kinds of jobs in the table.')

# split the desc field
tokenizer = Tokenizer(inputCol='desc_clean', outputCol='desc_words')
df = tokenizer.transform(df0)
# df.show()
# df.select('desc_words').show(10)

# compute TF-IDF
hashingTF = HashingTF(inputCol='desc_words', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
# print('tfidf for each job:', tfidf.select('desc_words_tfidf').show(10, truncate=False))

# data normalization
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
# tfidf.select("id", "norm").show(6)

# compute similarity between jobs and resume
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType

print('\nCompute the similarity between jobs and resume...')
dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())  # define dot-product function
tfidf = tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") == 0) \
    .select(

tokenizer = Tokenizer(inputCol="text", outputCol="words") wordsData = tokenizer.transform(sentenceData) wordsData.show(5) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures") featurizedData = hashingTF.transform(wordsData) featurizedData.show(10) featurizedData.printSchema() featurizedData.cache() #idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = IDF(inputCol="rawFeatures", outputCol="features").fit(featurizedData) #idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) dataset = rescaledData.select("features") from pyspark.ml.clustering import KMeans # Trains a k-means model. kmeans = KMeans().setK(10).setSeed(1) model = kmeans.fit(dataset) # Evaluate clustering by computing Within Set Sum of Squared Errors. wssse = model.computeCost(dataset) print("Within Set Sum of Squared Errors = " + str(wssse)) # Shows the result. centers = model.clusterCenters() print("Cluster Centers: ") for center in centers:
df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("trainReviews.tsv")

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(df)
wordsData.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(wordsData)
tf.show(10)
tf.head().rawFeatures

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

ml = LogisticRegression(featuresCol="features", labelCol='category', regParam=0.01)
mlModel = ml.fit(tfidf.limit(5000))
res_train = mlModel.transform(tfidf)

extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())
res_train.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()

test_df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("testReviews.tsv")
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(test_df)

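# A minimal continuation sketch (an assumption, not shown in the original
# fragment): push the test reviews through the same fitted HashingTF/IDF
# stages and score them with the trained logistic regression model.
test_tf = hashingTF.transform(wordsData)
test_tfidf = idf.transform(test_tf)
res_test = mlModel.transform(test_tfidf)
res_test.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()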