except: logger.error("Can't input dataset") # Join posts_df and tags_df together and prepare training dataset selected_tags_df = tags_df.filter(tags_df.Tag.isin( tags_set.value)).na.drop(how='any') tags_questions_df = selected_tags_df.join(posts_df, "Id") training_df = tags_questions_df.select(['Tag', 'Body', 'Id']).na.drop(how='any') logger.debug("successfully get training_df") # tokenize post texts and get term frequency and inverted document frequency logger.debug("Start to generate TFIDF features") tokenizer = Tokenizer(inputCol="Body", outputCol="Words") tokenized_words = tokenizer.transform(training_df.na.drop(how='any')) tokenizer.save(tokenizer_file) hashing_TF = HashingTF(inputCol="Words", outputCol="Features", numFeatures=20000) #, numFeatures=200 hashing_TF.save(hashing_tf_file) TFfeatures = hashing_TF.transform(tokenized_words.na.drop(how='any')) idf = IDF(inputCol="Features", outputCol="IDF_features") idfModel = idf.fit(TFfeatures.na.drop()) idfModel.save(idf_model_file) TFIDFfeatures = idfModel.transform(TFfeatures.na.drop(how='any')) logger.debug("Get TFIDF features successfully") # for feature in TFIDFfeatures.select("IDF_features", "Tag").take(3): # logger.info(feature) =