Example #1
    def run(self):
        import io
        import os
        import boto3
        import matplotlib as mlp
        mlp.use('agg')
        from matplotlib import pyplot as plt
        import pandas_td as td
        from fbprophet import Prophet

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

        # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
        df = td.read_td(
            """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end), engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=self.period)
        forecast = model.predict(future)

        fig1 = model.plot(forecast)
        fig2 = model.plot_components(forecast)
        predict_fig_data = io.BytesIO()
        component_fig_data = io.BytesIO()
        fig1.savefig(predict_fig_data, format='png')
        fig2.savefig(component_fig_data, format='png')
        predict_fig_data.seek(0)
        component_fig_data.seek(0)

        # Upload figures to S3
        # boto3 assumes the environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY" are set:
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')

        predicted_fig_file = "predicted.png"
        component_fig_file = "component.png"

        # Choose the ACL to suit your purpose
        s3.Object(os.environ['S3_BUCKET'],
                  predicted_fig_file).put(ACL='public-read',
                                          Body=predict_fig_data,
                                          ContentType='image/png')
        s3.Object(os.environ['S3_BUCKET'],
                  component_fig_file).put(ACL='public-read',
                                          Body=component_fig_data,
                                          ContentType='image/png')

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con,
                 if_exists='replace')
Example #2
def write_td_table(database_name, table_name):
    import pandas as pd
    import random

    # `td` (pandas_td), `tdclient`, `os`, `con` (a pandas_td connection) and
    # `AlreadyExistsError` (tdclient.errors.AlreadyExistsError) are assumed to be
    # defined at module level; see the usage sketch after Example #12.
    # TODO: use the TD client to check whether the table exists
    engine = td.create_engine(f"presto:{database_name}", con=con)
    df = pd.DataFrame({"c": [random.random() for _ in range(20)]})

    # Manipulating data in Treasure Data via Python.
    # Uses https://github.com/treasure-data/td-client-python

    tdc = tdclient.Client(apikey=os.environ['TD_API_KEY'],
                          endpoint=os.environ['TD_API_SERVER'])

    try:
        tdc.create_database(database_name)
    except AlreadyExistsError:
        pass

    try:
        tdc.create_log_table(database_name, table_name)
    except AlreadyExistsError:
        pass

    table_path = f"{database_name}.{table_name}"
    td.to_td(df, table_path, con, if_exists='replace', index=False)
Example #3
    def run(self, with_aws=True):
        import pandas_td as td
        from fbprophet import Prophet

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

        # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
        df = td.read_td(
            """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end), engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=self.period)
        forecast = model.predict(future)

        if with_aws:
            self._upload_graph(model, forecast)

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con,
                 if_exists='replace')
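Example #3 delegates the figure upload to a self._upload_graph helper that is not included in the snippet. Below is a minimal sketch of such a helper, assuming the same S3 flow as Example #1 (the method name and arguments come from the call above; the AWS credentials and the S3_BUCKET environment variable are the same assumptions as in Example #1):

    def _upload_graph(self, model, forecast):
        # Hypothetical helper mirroring Example #1: render the Prophet figures
        # and upload them to S3. Assumes AWS credentials come from the usual
        # boto3 environment variables and S3_BUCKET names the target bucket.
        import io
        import os
        import boto3

        fig1 = model.plot(forecast)
        fig2 = model.plot_components(forecast)
        predict_fig_data = io.BytesIO()
        component_fig_data = io.BytesIO()
        fig1.savefig(predict_fig_data, format='png')
        fig2.savefig(component_fig_data, format='png')
        predict_fig_data.seek(0)
        component_fig_data.seek(0)

        s3 = boto3.resource('s3')
        s3.Object(os.environ['S3_BUCKET'], 'predicted.png').put(
            ACL='public-read', Body=predict_fig_data, ContentType='image/png')
        s3.Object(os.environ['S3_BUCKET'], 'component.png').put(
            ACL='public-read', Body=component_fig_data, ContentType='image/png')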
Example #4
    def jspca(self):
        import os
        os.system('pip install pandas')
        os.system('pip install scipy')
        os.system('pip install sklearn')
        os.system('pip install pandas-td')
        os.system('pip install pyyaml')

        from sklearn.decomposition import PCA
        import pandas as pd
        import pandas_td
        import yaml
        from scipy.spatial.distance import pdist, squareform
        from scipy.stats import entropy

        def _js(_P, _Q):
            _M = 0.5 * (_P + _Q)
            return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

        with open('config/params.yml') as f:
            params = yaml.load(f, Loader=yaml.SafeLoader)

        apikey = os.environ.get("python_apikey")
        dbname = params['dbname']

        connection = pandas_td.connect(apikey=apikey)

        engine = pandas_td.create_engine('presto:{}'.format(dbname),
                                         con=connection)

        df = pandas_td.read_td(
            'select label, lambda from pca_input order by label asc', engine)

        pca = PCA(n_components=2, random_state=0)

        dist = []
        for index, row in df.iterrows():
            dist.append([0 if v is None else v for v in row['lambda'][2:]])

        dist_matrix = squareform(pdist(dist, metric=_js))

        result_df = pd.DataFrame(pca.fit_transform(dist_matrix),
                                 columns=['x', 'y'])

        pandas_td.to_td(result_df,
                        '{}.principal_component'.format(dbname),
                        connection,
                        if_exists='replace')
Example #5
 def build_engine(self, engine_type, database, args):
     ip = get_ipython()
     name = '{}:{}'.format(engine_type, database)
     code_args = [repr(name)]
     # connection
     if args.connection:
         con = ip.ev(args.connection)
         code_args.append('con={}'.format(args.connection))
     else:
         con = self.context.connect()
     # engine
     if args.quiet:
         params = {'show_progress': False, 'clear_progress': False}
     elif args.verbose:
         params = {'show_progress': True, 'clear_progress': False}
     else:
         params = {}
     code_args += ['{}={}'.format(k, v) for k, v in params.items()]
     self.push_code("_e = td.create_engine({})".format(', '.join(code_args)))
     return td.create_engine(name, con=con, **params)
Example #6
 def build_engine(self, engine_type, database, args):
     ip = get_ipython()
     name = "{}:{}".format(engine_type, database)
     code_args = [repr(name)]
     # connection
     if args.connection:
         con = ip.ev(args.connection)
         code_args.append("con={}".format(args.connection))
     else:
         con = self.context.connect()
     # engine
     if args.quiet:
         params = {"show_progress": False, "clear_progress": False}
     elif args.verbose:
         params = {"show_progress": True, "clear_progress": False}
     else:
         params = {}
     code_args += ["{}={}".format(k, v) for k, v in params.items()]
     self.push_code("_e = td.create_engine({})".format(", ".join(code_args)))
     return td.create_engine(name, con=con, **params)
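Examples #5 and #6 show an IPython magic helper: build_engine returns a live engine and, via push_code, emits the equivalent td.create_engine call into the notebook. For reference, here is a hedged sketch of the hand-written equivalent of what it sets up for a quiet Presto engine (the database name is illustrative, and td.connect() picking the API key up from the environment is an assumption):

import pandas_td as td

# Roughly what build_engine('presto', 'sample_datasets', args) produces when
# args.quiet is set: a Presto engine with progress output disabled.
con = td.connect()  # assumed to read the API key/endpoint from the environment
_e = td.create_engine('presto:sample_datasets', con=con,
                      show_progress=False, clear_progress=False)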
Example #7
    def run(self):
        import pandas as pd
        import pandas_td as td
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        dbname = self.dbname
        source_table = self.source_table

        engine = td.create_engine('presto:{}'.format(dbname), con=connection)

        # Fetch a 25% random sample of the data
        df = td.read_td(
            """
            select *
            from {} tablesample bernoulli(25)
            """.format(source_table), engine)
        # You can use Hive instead:
        #
        # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
        # df = td.read_td(
        #     """
        #     select *
        #     from {}_train
        #     where rnd < 0.25
        #     """.format(source_table),
        #     engine_hive
        # )
        df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')

        y = df.medv
        X = df.drop(columns=['medv'])

        reg = ExtraTreesRegressor()
        reg = reg.fit(X, y)

        feature_importances = pd.DataFrame({
            'column':
            X.columns,
            'importance':
            reg.feature_importances_
        })
        td.to_td(feature_importances,
                 'boston.feature_importances',
                 con=connection,
                 if_exists='replace',
                 index=False)

        model = SelectFromModel(reg, prefit=True)

        feature_idx = model.get_support()
        feature_name = df.drop(columns=['medv']).columns[feature_idx]
        selected_features = set(feature_name)

        categorical_columns = set(['rad', 'chas'])
        quantitative_columns = set(X.columns) - categorical_columns

        feature_types = {
            'categorical_columns': categorical_columns,
            'quantitative_columns': quantitative_columns
        }
        feature_query = self._feature_column_query(selected_features,
                                                   feature_types=feature_types)

        # Store query if possible
        try:
            import digdag
            digdag.env.store({'feature_query': feature_query})

        except ImportError:
            pass
Example #8
 def __init__(self, apikey, endpoint, database='sample_datasets'):
     self.conn = td.connect(apikey=apikey, endpoint=endpoint)
     self.database = database
     self.engine = td.create_engine('presto:{}'.format(database), self.conn)
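Example #8 is only the constructor of a small wrapper class. For illustration, a possible read method built on the same engine, using td.read_td as in the other examples (the class name TDHelper and the method name are assumptions, not part of the original snippet):

import pandas_td as td

class TDHelper:
    def __init__(self, apikey, endpoint, database='sample_datasets'):
        self.conn = td.connect(apikey=apikey, endpoint=endpoint)
        self.database = database
        self.engine = td.create_engine('presto:{}'.format(database), self.conn)

    def read(self, sql):
        # Run a Presto query against the configured database and return a DataFrame.
        return td.read_td(sql, self.engine)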
Example #9
#!/usr/bin/python

import os
import sys
import pandas as pd
import pandas_td as td

print("load.py started")

con = td.connect(apikey="TD_APIKEY", endpoint='https://api.treasuredata.com')

# Type: Presto, Database: sample_datasets
engine = td.create_engine('presto:sample_datasets', con=con)

# Read a Treasure Data query into a DataFrame.
df = td.read_td_query('''
SELECT time, close FROM nasdaq LIMIT 100
''',
                      engine,
                      index_col='time',
                      parse_dates={'time': 's'})

print(df.head())

# Output the DataFrame to Treasure Data via streaming import. (Not recommended for large datasets.)
td.to_td(df, 'workflow_temp.test_emr', con, if_exists='replace', index=False)

print("load.py finished")
Example #10
def run():
    # Original code is published in the official TensorFlow documentation under the Apache License, Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    #os.system("pip install pandas-td tensorflow_hub boto3")

    import os
    import tarfile

    import boto3
    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled    
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export the TF model to S3
        feature_spec = tf.feature_column.make_parse_example_spec(
            [embedded_text_feature_column])
        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        estimator.export_saved_model(EXPORT_DIR_BASE,
                                     serving_input_receiver_fn)

        with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
            tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

        # Upload the TensorFlow model to S3
        # boto3 assumes the environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY" are set:
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')
        # Choose the ACL to suit your purpose
        s3.Bucket(os.environ['S3_BUCKET']).upload_file('tfmodel.tar.gz',
                                                       'tfmodel.tar.gz')

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
Example #11
def run(with_aws=True):
    # Original code is published in the official TensorFlow documentation under the Apache License, Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    import os
    import sys
    os.system(f"{sys.executable} -m pip install pandas-td")
    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )

    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
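Examples #10 and #11 call a get_predictions helper (and Example #10 a module-level EXPORT_DIR_BASE) that are not part of the snippet. A minimal sketch of get_predictions, under the assumption that it simply collects the predicted class id from the TF 1.x estimator ("class_ids" is the standard DNNClassifier prediction key):

def get_predictions(estimator, input_fn):
    # Collect the predicted class id (0 or 1, matching `polarity`) for each row.
    return [int(p["class_ids"][0]) for p in estimator.predict(input_fn=input_fn)]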
Example #12
def read_td_table(database_name, table_name, engine_name='presto', limit=1000):
    # `td` (pandas_td) and `con` are assumed to be module-level; see the usage sketch below.
    engine = td.create_engine(f"{engine_name}:{database_name}", con=con)
    df = td.read_td(f'SELECT * FROM {table_name} LIMIT {limit}', engine)
    print(df)
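Examples #2 and #12 both assume module-level td and con objects. A short usage sketch, assuming the API key and endpoint come from the TD_API_KEY and TD_API_SERVER environment variables as in the other examples (the database and table names are placeholders):

import os
import pandas_td as td
import tdclient
from tdclient.errors import AlreadyExistsError

# Shared connection used by write_td_table() and read_td_table() above.
con = td.connect(apikey=os.environ['TD_API_KEY'],
                 endpoint=os.environ['TD_API_SERVER'])

write_td_table('my_database', 'my_table')
read_td_table('my_database', 'my_table', limit=10)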
Example #13
def heatmap_maker(val, id):
    # -----------------------------------  SQL processing  -------------------------------------------------
    # Pull the coordinate data directly from TD
    print('Started loading data from TD')
    # Initialize query engine
    engine = td.create_engine('presto:rails_events_production')

    # Read Treasure Data query into a DataFrame.
    sql = sql_selecter(val, id)  # all data
    #sql = sq_selecter_with_daterange(val,id)  # limit to a date range
    df = td.read_td(sql, engine)
    lines = df.values.tolist()
    print('Finished loading data')

    # Settings for the source image data
    line = lines[0]
    file = str(int(line[1]))  #item_id
    media_id = str(int(line[0]))  #media_id

    print('item_id:{}/media_id:{}'.format(file, media_id))

    # ------------------------------ Build the data -------------------------------------
    # Coordinate correction, Ver02
    axis_data_lst = []
    f_lst = []
    idx = 0
    for line in lines:
        data = corrected_data(line)  # coordinate correction
        axis_data_lst.append(data)
    print('Coordinate correction finished')

    # Build the heatmap data, Ver02
    lst_2d = data_maker(*axis_data_lst)
    print('Heatmap data created')

    # ---------------------------------- Build the heatmaps ---------------------------------------------
    # Adjust the range of v
    v_max = v_max_set(*lst_2d)
    #print('vmax:{}'.format(v_max))

    # Heatmap with a legend (color bar)
    hm_name_sample = "./tmp/hm_org.jpg"
    plt.figure(figsize=(40, 16))
    #fig, ax = plt.subplots(figsize=(32,16))
    plt.tick_params(labelsize=30)
    sns.heatmap(lst_2d, vmin=0, vmax=v_max)
    #sns.heatmap(lst_2d,vmin=0, vmax=5)
    plt.savefig(hm_name_sample)
    print('Heatmap with legend created')

    # Heatmap used for compositing
    hm_name = "./tmp/hm.jpg"  # <- prefix with a directory
    plt.figure(figsize=(32, 16))
    sns.heatmap(lst_2d,
                vmin=0,
                vmax=v_max,
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.savefig(hm_name)
    print('Compositing heatmap created')

    # Trim the heatmap margins
    outfile = './tmp/hm_edge.jpg'
    #outfile = margin_cut(hm_name,outfile)
    margin_cut(hm_name, outfile)
    print('Margin trimming finished')

    # ------------------------------------- Load the image data from GCS -------------------------------------------------
    # Load the image directly
    #os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/tani_kyuichiro/nurve-cloud-98b3f-bb7c97f8fb03.json'

    # Check that the file exists and get its extension
    bucketname = 'rent-production'
    prefix = 'medium_items/media/' + media_id + '/'
    ext = extension_get(bucketname, prefix)  # extract the extension
    if not ext:
        #exit()
        return False  # stop if there is no image file

    #bucket_name = 'rent-production/medium_items/media/'+media_id
    bucket_name = 'rent-production'
    source_blob_name = 'medium_items/media/' + media_id + '/2048x1024.' + ext
    file_name = './imgdata/2048x1024.jpg'
    try:
        download_blob(bucket_name, source_blob_name, file_name)
    except Exception:
        print('file not found. {}'.format(source_blob_name))

    print('Original image loaded')

    # ----------------------------------------- Image processing ------------------------------------------------------------
    # Load the image data
    filename = './imgdata/2048x1024.jpg'
    try:
        img = cv2.imread(filename)
        height = img.shape[0]  # only here to trigger an error if the read failed
    except AttributeError:
        print('file not found. {}'.format(filename))
        return False

    # Stop if an image with a different size was registered
    if img.shape[0] == 1024 and img.shape[1] == 2048:
        print('img:{}'.format(img.shape))
    else:
        print('size is diff!')
        return False

    # Load the heatmap with its margins removed
    filename = './tmp/hm_edge.jpg'  # <- prefix with a directory
    try:
        img2 = cv2.imread(filename)
        height = img2.shape[0]  # only here to trigger an error if the read failed
    except AttributeError:
        print('file not found.')
        return False

    # Size adjustment
    height = img2.shape[0]
    width = img2.shape[1]

    #img2_1 = cv2.resize(img2 , (int(width*(2048.5/width)), int(height*(1024/height))))
    img2_1 = cv2.resize(
        img2, (int(width * (2048 / width)), int(height * (1024 / height))))
    print('img2_1:{}'.format(img2_1.shape))
    print('Heatmap resized')

    # Blend the images
    blended = cv2.addWeighted(src1=img,
                              alpha=0.6,
                              src2=img2_1,
                              beta=0.4,
                              gamma=0.3)

    # Save the blended image
    cv2.imwrite('./tmp/blended_test.jpg', blended)  # <- prefix with a directory
    print('Blended image created')

    # Prepare to crop the legend
    im = Image.open(hm_name_sample)

    # Crop the legend & resize
    #im_crop = im.crop((2230,120,2350,1030))
    im_crop = im.crop((3150, 170, 3250, 1450))  # 2020/04/16 fixed an offset in the legend crop
    im_crop_rsize = im_crop.resize((100, 1024))
    plt.imshow(im_crop_rsize)
    im_crop_rsize.save('./tmp/colorbar_crop.jpg', quality=100)  # <- prefix with a directory
    print('Legend created')

    # Create a blank image
    height = 1024
    width = 2200
    blank = np.zeros((height, width, 3))
    blank += 255  # <- add 255 to the all-zero data to make it white

    cv2.imwrite('./tmp/blank.jpg', blank)  # <- prefix with a directory
    print('Base image created')

    # Compose the heatmap with its legend
    img0 = cv2.imread('./tmp/blank.jpg')  # blank image
    img1 = cv2.imread('./tmp/blended_test.jpg')  # blended heatmap image
    img2 = cv2.imread('./tmp/colorbar_crop.jpg')  # legend

    img0 = cv2.cvtColor(img0, cv2.COLOR_BGR2RGB)
    img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
    img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)

    base_img = img0

    # Anchor point for pasting the heatmap image
    x_offset = 0
    y_offset = 0

    # Paste the heatmap image
    base_img[y_offset:y_offset + img1.shape[0],
             x_offset:x_offset + img1.shape[1]] = img1

    # Anchor point for pasting the legend image
    #x_offset=2050
    x_offset = 2100
    y_offset = 0

    # Paste the legend image
    base_img[y_offset:y_offset + img2.shape[0],
             x_offset:x_offset + img2.shape[1]] = img2
    base_img = cv2.cvtColor(base_img, cv2.COLOR_BGR2RGB)

    media_dir = './medium_items/media/' + media_id

    if not os.path.exists(media_dir):
        os.makedirs(media_dir)  # create the directory if it does not exist

    dir_file_name = './medium_items/media/' + media_id + '/heatmap.jpg'

    cv2.imwrite(dir_file_name, base_img)
    print('Composite image with legend created')

    # ----------------------------------------- Upload to GCS -----------------------------------------------
    # Create a storage client.

    #os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/tani_kyuichiro/nurve-cloud-98b3f-bb7c97f8fb03.json'

    storage_client = google.cloud.storage.Client()

    bucket_name = 'rent-heatmap'
    bucket = storage_client.get_bucket(bucket_name)

    source_file_name = 'medium_items/media/' + media_id + '/heatmap.jpg'
    blob = bucket.blob(source_file_name)

    # Upload the local file to Cloud Storage.
    blob.upload_from_filename(source_file_name)

    print('File {} uploaded to {}.'.format(source_file_name, bucket))
    print('Upload to GCS finished')

    # ---------------------------------------------- Delete the working directory -----------------------------------------------
    shutil.rmtree('./medium_items/media/' + media_id)
    print('Directory removed')

    return True