def lr_example(n_common=10, min_freq=1):
    """Train and evaluate a linear regression on the slim Twitter sample.

    Loads ``example_data/twitter_2020-03-10_slim.csv`` (resolved relative to
    this file), converts it to features, trains ``LinearRegressionWithSGD``
    on the even-indexed points with ``age`` as the label, and prints the
    root mean square error measured on the odd-indexed points.

    Args:
        n_common: Forwarded as the second argument of
            ``featurizer.convert_df_to_feature``. Defaults to 10, the value
            that was previously hard-coded.
        min_freq: Forwarded as the third argument of
            ``featurizer.convert_df_to_feature``. Defaults to 1, the value
            that was previously hard-coded.

    Raises:
        ValueError: If no test point remains after the parity split and the
            feature-dimension filter, so no RMSE can be computed.
    """
    import math

    pwd = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(pwd, 'example_data', 'twitter_2020-03-10_slim.csv')
    print(path)
    df = csv_parser.load_as_df(path, twitter_schema)
    df.show(3)

    # Keep only rows that carry both a label ('age') and a 'feature' value.
    converted = featurizer.convert_df_to_feature(
        df, n_common, min_freq).filter(
            lambda row: row['age'] is not None and row['feature'] is not None)
    # NOTE(review): an earlier comment described rows as (age, sex, feature);
    # only 'age' and 'feature' are read here.
    converted = converted.map(
        lambda row: LabeledPoint(row['age'], concat_vectors(row['feature'])))
    # Pair every point with its index so the data can be split by parity.
    indexed = converted.zipWithIndex()

    # Even indices form the training set.
    train_rdd = indexed.filter(lambda x: x[1] % 2 == 0).map(lambda x: x[0])

    feature_dim = len(train_rdd.first().features)

    # Odd indices form the test set; points whose feature length differs
    # from the first training point are dropped. The result is collected,
    # so it is a plain list rather than an RDD.
    test_points = (
        indexed.filter(lambda x: x[1] % 2 == 1)
        .map(lambda x: x[0])
        .filter(lambda x: len(x.features) == feature_dim)
        .collect())

    print("confirming dim of train rdd")
    for point in train_rdd.take(3):
        print(point.features)
        print(len(point.features))

    # Fail early with a clear message instead of a ZeroDivisionError at the
    # end, and before spending time on training.
    n = len(test_points)
    if n == 0:
        raise ValueError(
            'No test samples left after filtering; cannot compute RMSE')

    lrm = LinearRegressionWithSGD.train(train_rdd)

    # Test: accumulate the squared error over the held-out points.
    squared_error_sum = 0
    for lp in test_points:
        gt = lp.label
        pred = lrm.predict(lp.features)
        print(gt, pred)
        error = pred - gt
        squared_error_sum += error * error

    rmse = math.sqrt(squared_error_sum / n)

    print('Root mean square error: ' + str(rmse))
Example #2
0
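'''
Workflow for building the classification model (references: https://qiita.com/MahoTakara/items/b3d719ed1a3665730826, https://qiita.com/Hironsan/items/2466fe0f344115aff177)

1.  DONE: Word segmentation
2.  DONE: Morphological analysis
3.  DONE: Cleaning
4.  DONE: Normalization (including stop-word removal)
5.  DONE: Dictionary creation (mapping words to IDs) https://qiita.com/tatsuya-miyamoto/items/f505dfa8d5307f8c6e98 looks easy to do
6.  DONE: Vectorization (embedding, or ID to one-hot)
7.  TODO: Text feature extraction (e.g. text length)
8.  TODO: Proposed feature extraction (follow/follower features)
9.  TODO: Implement the classification model
10. TODO: Implement the evaluation metrics
11. TODO: Run the experiments
'''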

if __name__ == "__main__":
    # Manual smoke run: load the sample CSV, featurize it, and print a few
    # converted records.
    import os
    from parser.csv_parser import load_as_df
    from schema.twitter_schema import twitter_schema

    # The data file is located relative to this script's own directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = (
        script_dir + '/../example_data/20190528sentences_data_integrated.csv')

    tweets_df = load_as_df(csv_path, twitter_schema)
    tweets_df.show(3)

    # Print the first three results of the conversion.
    for record in convert_df_to_feature(tweets_df).take(3):
        print(record)
from featurizer.word2vec_featurizer import Word2VecFeaturizer
from featurizer.mult_featurizer import MultiFeaturizer
from featurizer.bert_featurizer import BertFeaturizer
from featurizer.tfidf_featurizer import TfidfFeaturizer
from featurizer.onehot_featurizer import OneHotFeaturizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

if __name__ == '__main__':

    spark = SparkSession.builder \
        .appName('Spark SQL and DataFrame') \
        .getOrCreate()
    # Load training data
    # dataPath = '../../example_data/twitter/20190528sentences_data_integrated.csv'
    dataPath = '../../example_data/twitter_2020-03-10.csv'
    df = load_as_df(dataPath, twitter_schema)
    converted_df = shape_df(spark, df).drop("age")
    converted_df.show(3)
    # model_path = "../../param/word2vec/entity_vector/entity_vector.model.bin"
    # wv = Word2VecFeaturizer(spark, model_path)
    # feat_df = wv.featurize(converted_df)
    model_path = "../../param/word2vec/twitter_model/w2v_gensim/word2vec_tweet.model"
    wv_tweet = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_tweet.featurize(converted_df)
    # model_path = "../../param/word2vec/niconico_model/nico_vec.bin"
    # wv_nico = Word2VecFeaturizer(spark, model_path, False)
    # feat_df = wv_nico.featurize(converted_df)
    # model_path = "../../param/bert/Japanese_L-24_H-1024_A-16_E-30_BPE_WWM_transformers"
    # bert = BertFeaturizer(spark, model_path)
    # feat_df = bert.featurize(converted_df)
    converted_df2 = shape_df(spark, df, 'nagisa', ['補助記号']).drop("age")