from pyspark.sql import SparkSession

from featurizer.mult_featurizer import MultiFeaturizer
from featurizer.bert_featurizer import BertFeaturizer
from featurizer.tfidf_featurizer import TfidfFeaturizer
from featurizer.onehot_featurizer import OneHotFeaturizer
# Assumed import (missing in the original; the path mirrors the featurizers above):
from featurizer.word2vec_featurizer import Word2VecFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# load_as_df, twitter_schema and shape_df are also used below; the module that
# defines them is not shown in the original, so no import is given here.

if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName('Spark SQL and DataFrame') \
        .getOrCreate()

    # Load training data
    # dataPath = '../../example_data/twitter/20190528sentences_data_integrated.csv'
    dataPath = '../../example_data/twitter_2020-03-10.csv'
    df = load_as_df(dataPath, twitter_schema)
    converted_df = shape_df(spark, df).drop("age")
    converted_df.show(3)

    # Word2Vec model trained on a Japanese entity corpus (disabled)
    # model_path = "../../param/word2vec/entity_vector/entity_vector.model.bin"
    # wv = Word2VecFeaturizer(spark, model_path)
    # feat_df = wv.featurize(converted_df)

    # Word2Vec model trained on tweets
    model_path = "../../param/word2vec/twitter_model/w2v_gensim/word2vec_tweet.model"
    wv_tweet = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_tweet.featurize(converted_df)

    # Word2Vec model trained on Niconico data (disabled)
    # model_path = "../../param/word2vec/niconico_model/nico_vec.bin"
    # wv_nico = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_nico.featurize(converted_df)

    # BERT featurizer (disabled)
    # model_path = "../../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    # bert = BertFeaturizer(spark, model_path)
    # feat_df = bert.featurize(converted_df)

    # Re-tokenize with nagisa, filtering out supplementary symbols (補助記号)
    converted_df2 = shape_df(spark, df, 'nagisa', ['補助記号']).drop("age")
    tfidf = TfidfFeaturizer(spark)
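    # Hedged sketch, not in the original: the script is truncated right after the
    # TF-IDF featurizer is constructed. By analogy with the featurize() calls
    # above, it would presumably continue like this (featurize() returning a
    # (label, features) DataFrame is an assumption based on the other featurizers):
    feat_df2 = tfidf.featurize(converted_df2)
    feat_df2.show(3)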
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Assumed import (missing in the original; the path mirrors the repo's featurizers):
from featurizer.word2vec_featurizer import Word2VecFeaturizer
# load_as_df, twitter_schema and shape_df are also used below; the module that
# defines them is not shown in the original, so no import is given here.

if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName('Spark SQL and DataFrame') \
        .getOrCreate()

    # Load training data
    # dataPath = '../../example_data/twitter/20190528sentences_data_integrated.csv'
    dataPath = '../../example_data/twitter_2020-03-10.csv'
    df = load_as_df(dataPath, twitter_schema)
    converted_df = shape_df(spark, df).drop("age")
    converted_df.show(3)

    # model_path = "../../param/word2vec/entity_vector/entity_vector.model.bin"
    # wv = Word2VecFeaturizer(spark, model_path)
    # feat_df = wv.featurize(converted_df)

    # Featurize with the tweet-trained Word2Vec model
    model_path = "../../param/word2vec/twitter_model/w2v_gensim/word2vec_tweet.model"
    wv_tweet = Word2VecFeaturizer(spark, model_path, False)
    feat_df = wv_tweet.featurize(converted_df)

    # model_path = "../../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    # bert = BertFeaturizer(spark, model_path)
    # feat_df = bert.featurize(converted_df)

    # Split the data into training and test sets (20% held out for testing)
    (trainingData, testData) = feat_df.randomSplit([0.8, 0.2], seed=2)

    # Gradient-boosted trees classifier on the word-vector features
    gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=15)
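    # Hedged sketch, not in the original: the script is truncated after the
    # classifier is constructed. A minimal continuation using the standard
    # Spark ML fit/transform/evaluate pattern (the evaluator import above
    # suggests this was the intent):
    model = gbt.fit(trainingData)             # train on the 80% split
    predictions = model.transform(testData)   # score the held-out 20%
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test set accuracy = %g" % accuracy)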
# Tail of BertFeaturizer.featurize(); the tokenizer/model setup and the
# per-row loop that produces node_list precede this fragment.
            for word in node_list:
                tmp_list.append(word)
            if len(tmp_list) != 0:
                label_list.append(float(data[0]))
                # Wrap the tokens in BERT's special markers and map them to ids
                bert_tokens = bert_tokenizer.tokenize(
                    " ".join(["[CLS]"] + tmp_list + ["[SEP]"]))
                token_ids = bert_tokenizer.convert_tokens_to_ids(bert_tokens)
                tokens_tensor = torch.tensor(token_ids).unsqueeze(0)  # batch of one
                all_outputs = bert_model(tokens_tensor)
                # Token-level hidden states for the single sentence in the batch
                embedding = all_outputs[-2].detach().numpy()[0]
                # Mean-pool over tokens to get one fixed-length sentence vector
                vec = np.mean(embedding, axis=0).tolist()
                vec_list.append(Vectors.dense(vec))
        zip_list = zip(label_list, vec_list)
        new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
        return new_df


if __name__ == '__main__':
    sc = SparkSession.builder\
        .appName('Spark SQL and DataFrame')\
        .getOrCreate()

    df = sc.createDataFrame([
        (21, "male", "友達が作ってくれたビネの白ドレス可愛すぎてたまらん😍"),
        (30, "female", "できればダブりたくないが初期の方のLRは避けたい"),
        (40, "male", "だから一生孤独でも構わんよ親にも作れと言われているけど"),
    ], ("age", "sex", "sentence"))
    converted_df = shape_df(sc, df).drop('age')
    data_path = "../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    bert = BertFeaturizer(sc, data_path)
    result_df = bert.featurize(converted_df)
    result_df.show(3)
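    # Hedged sketch, not in the original: a self-contained illustration of the
    # sentence-embedding technique featurize() implements above, i.e. run the
    # tokens through BERT and mean-pool the token vectors into one fixed-length
    # sentence vector. "bert-base-multilingual-cased" is a stand-in checkpoint,
    # not the repo's Japanese model.
    import numpy as np
    import torch
    from transformers import BertModel, BertTokenizer

    def mean_pooled_vector(sentence, model_name="bert-base-multilingual-cased"):
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        token_ids = tokenizer.encode(sentence, add_special_tokens=True)  # adds [CLS]/[SEP]
        tokens_tensor = torch.tensor(token_ids).unsqueeze(0)             # batch of one
        with torch.no_grad():
            outputs = model(tokens_tensor)
        embedding = outputs[0].numpy()[0]   # token-level hidden states
        return np.mean(embedding, axis=0)   # mean-pool over tokens

    print(mean_pooled_vector("今日はいい天気です").shape)  # (768,) for a base model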
# Tail of OneHotFeaturizer.featurize(); the per-row tokenization loop that
# fills tmp_list precedes this fragment.
                tmp_list.append(word)
            if len(tmp_list) != 0:
                label_list.append(data[0])
                wakati_list.append(tmp_list)
        # Register every tokenized document in the shared gensim dictionary,
        # which assigns an integer id to each distinct token
        self.global_dict.add_documents(wakati_list)
        dim = len(self.global_dict)
        vec_list = []
        for wakati in wakati_list:
            # Binary bag-of-words: 1 wherever the document contains the token
            vec = [0 for _ in range(dim)]
            for word in wakati:
                vec[self.global_dict.token2id[word]] = 1
            vec_list.append(Vectors.dense(vec))
        zip_list = zip(label_list, vec_list)
        new_df = self.spark.createDataFrame(zip_list, ("label", "features"))
        return new_df


if __name__ == '__main__':
    spark = SparkSession.builder\
        .appName('Spark SQL and DataFrame')\
        .getOrCreate()

    df = spark.createDataFrame([
        (21, "male", "友達が作ってくれたビネの白ドレス可愛すぎてたまらん😍"),
        (30, "female", "できればダブりたくないが初期の方のLRは避けたい"),
        (40, "male", "だから一生孤独でも構わんよ親にも作れと言われているけど"),
    ], ("age", "sex", "sentence"))
    converted_df = shape_df(spark, df).drop('age')
    oneHot = OneHotFeaturizer(spark)
    result_df = oneHot.featurize(converted_df)
    result_df.show(3)
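    # Hedged sketch, not in the original: a self-contained illustration of the
    # encoding technique featurize() implements above, i.e. a gensim Dictionary
    # assigns an integer id to every distinct token, and each document becomes a
    # binary vector with 1s at the ids of the tokens it contains.
    from gensim.corpora import Dictionary

    docs = [["猫", "が", "好き"], ["犬", "も", "好き"]]  # two toy tokenized documents
    toy_dict = Dictionary()
    toy_dict.add_documents(docs)   # vocabulary: 猫, が, 好き, 犬, も
    dim = len(toy_dict)            # 5

    vectors = []
    for doc in docs:
        vec = [0] * dim
        for word in doc:
            vec[toy_dict.token2id[word]] = 1   # presence, not frequency
        vectors.append(vec)
    print(vectors)   # both vectors share a 1 in the "好き" slot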