def _create_vectorize_table(self, engine, dbname, table_name, source_table,
                            feature_query):
    import textwrap

    import tdclient
    import pandas_td as td

    # Create the feature vector table, dropping any existing table of the same name
    with tdclient.Client(apikey=self.apikey, endpoint=self.endpoint) as client:
        db = client.database(dbname)
        try:
            db.table(table_name)
            db.table(table_name).delete()
        except tdclient.api.NotFoundError:
            pass
        db.create_log_table(table_name)

    hql = '''
        insert overwrite table {output_table}
        select
            rowid,
            {target_columns},
            medv as price
        from
            {source_table}
    '''.format_map({
        'output_table': table_name,
        'source_table': source_table,
        'target_columns': textwrap.indent(feature_query, '  ')
    })

    td.read_td(hql, engine)
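# Illustration (not part of the original workflow): a minimal, self-contained
# sketch of how the HQL above is composed from a feature_query. The column
# list and table names below are made-up placeholders.
import textwrap

example_feature_query = "rm,\nlstat"
example_hql = '''
insert overwrite table {output_table}
select
    rowid,
{target_columns},
    medv as price
from
    {source_table}
'''.format_map({
    'output_table': 'boston_vectorized',   # hypothetical output table
    'source_table': 'boston_train',        # hypothetical source table
    'target_columns': textwrap.indent(example_feature_query, '    ')
})
print(example_hql)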
def run(self):
    import boto3
    import matplotlib as mlp
    mlp.use('agg')
    from matplotlib import pyplot as plt
    import pandas_td as td
    from fbprophet import Prophet

    con = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

    # Note: Prophet requires a `ds` column holding the date string and a `y`
    # column holding the target value
    df = td.read_td(
        """
        select ds, y
        from {}
        where ds between '{}' and '{}'
        """.format(self.source_table, self.start, self.end), engine)

    model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
    model.fit(df)
    future = model.make_future_dataframe(periods=self.period)
    forecast = model.predict(future)

    fig1 = model.plot(forecast)
    fig2 = model.plot_components(forecast)

    predict_fig_data = io.BytesIO()
    component_fig_data = io.BytesIO()
    fig1.savefig(predict_fig_data, format='png')
    fig2.savefig(component_fig_data, format='png')
    predict_fig_data.seek(0)
    component_fig_data.seek(0)

    # Upload figures to S3
    # boto3 assumes the environment variables "AWS_ACCESS_KEY_ID" and
    # "AWS_SECRET_ACCESS_KEY" are set:
    # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
    s3 = boto3.resource('s3')
    predicted_fig_file = "predicted.png"
    component_fig_file = "component.png"

    # Choose an ACL appropriate for your purpose
    s3.Object(os.environ['S3_BUCKET'],
              predicted_fig_file).put(ACL='public-read',
                                      Body=predict_fig_data,
                                      ContentType='image/png')
    s3.Object(os.environ['S3_BUCKET'],
              component_fig_file).put(ACL='public-read',
                                      Body=component_fig_data,
                                      ContentType='image/png')

    # To avoid "TypeError: can't serialize Timestamp", convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    td.to_td(forecast,
             "{}.{}".format(self.dbname, self.target_table),
             con,
             if_exists='replace')
def run(self, with_aws=True):
    import pandas_td as td
    from fbprophet import Prophet

    con = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

    # Note: Prophet requires a `ds` column holding the date string and a `y`
    # column holding the target value
    df = td.read_td(
        """
        select ds, y
        from {}
        where ds between '{}' and '{}'
        """.format(self.source_table, self.start, self.end), engine)

    model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
    model.fit(df)
    future = model.make_future_dataframe(periods=self.period)
    forecast = model.predict(future)

    if with_aws:
        self._upload_graph(model, forecast)

    # To avoid "TypeError: can't serialize Timestamp", convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    td.to_td(forecast,
             "{}.{}".format(self.dbname, self.target_table),
             con,
             if_exists='replace')
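# _upload_graph is not defined in this snippet. A possible sketch,
# reconstructed from the inline S3-upload variant of run() shown earlier in
# this section (not the author's definitive implementation):
def _upload_graph(self, model, forecast):
    import io
    import os

    import boto3
    import matplotlib
    matplotlib.use('agg')

    fig1 = model.plot(forecast)
    fig2 = model.plot_components(forecast)

    predict_fig_data = io.BytesIO()
    component_fig_data = io.BytesIO()
    fig1.savefig(predict_fig_data, format='png')
    fig2.savefig(component_fig_data, format='png')
    predict_fig_data.seek(0)
    component_fig_data.seek(0)

    # boto3 reads AWS credentials from the environment; choose an ACL
    # appropriate for your purpose
    s3 = boto3.resource('s3')
    s3.Object(os.environ['S3_BUCKET'], 'predicted.png').put(
        ACL='public-read', Body=predict_fig_data, ContentType='image/png')
    s3.Object(os.environ['S3_BUCKET'], 'component.png').put(
        ACL='public-read', Body=component_fig_data, ContentType='image/png')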
def jspca(self):
    os.system('pip install pandas')
    os.system('pip install scipy')
    os.system('pip install sklearn')
    os.system('pip install pandas-td')
    os.system('pip install pyyaml')

    from sklearn.decomposition import PCA
    import pandas as pd
    import pandas_td
    import yaml
    from scipy.spatial.distance import pdist, squareform
    from scipy.stats import entropy

    def _js(_P, _Q):
        # Jensen-Shannon divergence between two distributions
        _M = 0.5 * (_P + _Q)
        return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

    with open('config/params.yml') as f:
        params = yaml.safe_load(f)

    apikey = os.environ.get("python_apikey")
    dbname = params['dbname']

    connection = pandas_td.connect(apikey=apikey)
    engine = pandas_td.create_engine('presto:{}'.format(dbname), con=connection)

    df = pandas_td.read_td(
        'select label, lambda from pca_input order by label asc', engine)

    # Compute pairwise Jensen-Shannon distances, then project to 2D with PCA
    pca = PCA(n_components=2, random_state=0)
    dist = []
    for index, row in df.iterrows():
        dist.append([0 if v is None else v for v in row['lambda'][2:]])
    dist_matrix = squareform(pdist(dist, metric=_js))
    result_df = pd.DataFrame(pca.fit_transform(dist_matrix), columns=['x', 'y'])

    pandas_td.to_td(result_df,
                    '{}.principal_component'.format(dbname),
                    connection,
                    if_exists='replace')
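# A quick, self-contained sanity check of the _js distance defined above on
# two toy distributions (values are illustrative, not from the workflow):
import numpy as np
from scipy.stats import entropy

P = np.array([0.5, 0.5])
Q = np.array([0.9, 0.1])
M = 0.5 * (P + Q)
js = 0.5 * (entropy(P, M) + entropy(Q, M))
print(js)  # ~0.10 nats; 0.0 would mean the distributions are identical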
def run(self):
    import pandas as pd
    import pandas_td as td
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

    dbname = self.dbname
    source_table = self.source_table
    engine = td.create_engine('presto:{}'.format(dbname), con=connection)

    # Fetch a 25% random sample of the data
    df = td.read_td(
        """
        select *
        from {} tablesample bernoulli(25)
        """.format(source_table), engine)

    # You can use Hive instead:
    #
    # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
    # df = td.read_td(
    #     """
    #     select *
    #     from {}_train
    #     where rnd < 0.25
    #     """.format(source_table),
    #     engine_hive
    # )

    df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')
    y = df.medv
    X = df.drop(columns=['medv'])

    categorical_columns = set(['rad', 'chas'])
    quantitative_columns = set(X.columns) - categorical_columns

    reg = ExtraTreesRegressor()
    reg = reg.fit(X, y)

    feature_importances = pd.DataFrame({
        'column': X.columns,
        'importance': reg.feature_importances_
    })
    td.to_td(feature_importances,
             'boston.feature_importances',
             con=connection,
             if_exists='replace',
             index=False)

    model = SelectFromModel(reg, prefit=True)
    feature_idx = model.get_support()
    feature_name = df.drop(columns=['medv']).columns[feature_idx]
    selected_features = set(feature_name)

    feature_types = {
        'categorical_columns': categorical_columns,
        'quantitative_columns': quantitative_columns
    }
    feature_query = self._feature_column_query(selected_features,
                                               feature_types=feature_types)

    # Store the generated query for downstream tasks, if digdag is available
    try:
        import digdag
        digdag.env.store({'feature_query': feature_query})
    except ImportError:
        pass
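# _feature_column_query is not defined in this snippet. Given that
# _create_vectorize_table above splices feature_query into a select list, a
# hypothetical stand-in could simply join the selected column names; the real
# helper presumably treats the categorical and quantitative column sets in
# feature_types differently.
def _feature_column_query_sketch(selected_features, feature_types):
    return ',\n'.join(sorted(selected_features))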
def run():
    # The original code is published in the official TensorFlow documentation
    # under the Apache License, Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    # os.system("pip install pandas-td tensorflow_hub boto3")

    import boto3
    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
        """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
        """, presto)

    # Shuffling has already been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export the TF model to S3
        feature_spec = tf.feature_column.make_parse_example_spec(
            [embedded_text_feature_column])
        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        estimator.export_saved_model(EXPORT_DIR_BASE, serving_input_receiver_fn)

        with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
            tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

        # Upload the TensorFlow model to S3
        # boto3 assumes the environment variables "AWS_ACCESS_KEY_ID" and
        # "AWS_SECRET_ACCESS_KEY" are set:
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')
        # Choose an ACL appropriate for your purpose
        s3.Bucket(os.environ['S3_BUCKET']).upload_file('tfmodel.tar.gz',
                                                       'tfmodel.tar.gz')

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)
        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data
    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
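# get_predictions is not defined in this snippet. In the TF Hub tutorial this
# code is adapted from, it extracts the predicted class id for each example;
# a sketch along those lines:
def get_predictions(estimator, input_fn):
    return [p["class_ids"][0] for p in estimator.predict(input_fn=input_fn)]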
def run(with_aws=True):
    # The original code is published in the official TensorFlow documentation
    # under the Apache License, Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub
    import sys

    os.system(f"{sys.executable} -m pip install pandas-td")
    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )

    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
        """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
        """, presto)

    # Shuffling has already been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export the TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)
        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data
    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
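# _upload_model is not defined in this snippet. A possible sketch,
# reconstructed from the inline export/upload code in the previous variant of
# run(); EXPORT_DIR_BASE is assumed to be defined at module level.
def _upload_model(embedded_text_feature_column, estimator):
    import os
    import tarfile

    import boto3
    import tensorflow as tf

    feature_spec = tf.feature_column.make_parse_example_spec(
        [embedded_text_feature_column])
    serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    estimator.export_saved_model(EXPORT_DIR_BASE, serving_input_receiver_fn)

    with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
        tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

    # boto3 reads AWS credentials from the environment; choose an ACL
    # appropriate for your purpose
    s3 = boto3.resource('s3')
    s3.Bucket(os.environ['S3_BUCKET']).upload_file('tfmodel.tar.gz',
                                                   'tfmodel.tar.gz')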
def read_td_table(database_name, table_name, engine_name='presto', limit=1000):
    engine = td.create_engine(f"{engine_name}:{database_name}", con=con)
    df = td.read_td(f'SELECT * FROM {table_name} LIMIT {limit}', engine)
    print(df)
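# Example invocation (the database and table names are placeholders; `td` is
# assumed to be pandas-td and `con` a connection created at module level):
# read_td_table('sample_datasets', 'www_access', engine_name='presto', limit=10)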
def heatmap_maker(val, id):
    # ----------------------------------- SQL processing -----------------------------------
    # Pull the coordinate data directly from TD
    print('Starting data import from TD')

    # Initialize query engine
    engine = td.create_engine('presto:rails_events_production')

    # Read Treasure Data query into a DataFrame.
    sql = sql_selecter(val, id)  # all data
    # sql = sq_selecter_with_daterange(val, id)  # date-range query
    df = td.read_td(sql, engine)
    lines = df.values.tolist()
    print('Data import finished')

    # Settings for the source image
    line = lines[0]
    file = str(int(line[1]))      # item_id
    media_id = str(int(line[0]))  # media_id
    print('item_id:{}/media_id:{}'.format(file, media_id))

    # ------------------------------ Data preparation --------------------------------------
    # Coordinate correction (Ver02)
    axis_data_lst = []
    f_lst = []
    idx = 0
    for line in lines:
        data = corrected_data(line)  # coordinate correction
        axis_data_lst.append(data)
    print('Coordinate correction finished')

    # Build the heatmap data (Ver02)
    lst_2d = data_maker(*axis_data_lst)
    print('Heatmap data created')

    # ---------------------------------- Heatmap rendering ---------------------------------
    # Adjust the range of v
    v_max = v_max_set(*lst_2d)
    # print('vmax:{}'.format(v_max))

    # Heatmap with legend
    hm_name_sample = "./tmp/hm_org.jpg"
    plt.figure(figsize=(40, 16))
    # fig, ax = plt.subplots(figsize=(32, 16))
    plt.tick_params(labelsize=30)
    sns.heatmap(lst_2d, vmin=0, vmax=v_max)
    # sns.heatmap(lst_2d, vmin=0, vmax=5)
    plt.savefig(hm_name_sample)
    print('Heatmap with legend created')

    # Heatmap used for blending
    hm_name = "./tmp/hm.jpg"  # <- add the dir
    plt.figure(figsize=(32, 16))
    sns.heatmap(lst_2d,
                vmin=0,
                vmax=v_max,
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.savefig(hm_name)
    print('Heatmap for blending created')

    # Trim the heatmap margins
    outfile = './tmp/hm_edge.jpg'
    # outfile = margin_cut(hm_name, outfile)
    margin_cut(hm_name, outfile)
    print('Margin trimming finished')

    # ------------------------------------- Load image data from GCS -----------------------
    # Load the image directly
    # os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/tani_kyuichiro/nurve-cloud-98b3f-bb7c97f8fb03.json'

    # Check that the file exists and get its extension
    bucketname = 'rent-production'
    prefix = 'medium_items/media/' + media_id + '/'
    ext = extension_get(bucketname, prefix)  # extract the extension
    if not ext:
        # exit()
        return False  # stop if there is no image file

    # bucket_name = 'rent-production/medium_items/media/' + media_id
    bucket_name = 'rent-production'
    source_blob_name = 'medium_items/media/' + media_id + '/2048x1024.' + ext
    file_name = './imgdata/2048x1024.jpg'
    try:
        download_blob(bucket_name, source_blob_name, file_name)
    except:
        print('file not found. {}'.format(source_blob_name))
    print('Original image loaded')

    # ----------------------------------------- Image processing ---------------------------
    # Read the image data
    filename = './imgdata/2048x1024.jpg'
    try:
        img = cv2.imread(filename)
        height = img.shape[0]  # only here to trigger an error when the read failed
    except AttributeError:
        print('file not found. {}'.format(filename))
        return False

    # Abort if an image with an unexpected size was registered
    if img.shape[0] == 1024 and img.shape[1] == 2048:
        print('img:{}'.format(img.shape))
    else:
        print('size is diff!')
        return False

    # Read the margin-trimmed heatmap
    filename = './tmp/hm_edge.jpg'  # <- add the dir
    try:
        img2 = cv2.imread(filename)
        height = img2.shape[0]  # only here to trigger an error when the read failed
    except AttributeError:
        print('file not found.')
        return False

    # Resize
    height = img2.shape[0]
    width = img2.shape[1]
    # img2_1 = cv2.resize(img2, (int(width * (2048.5 / width)), int(height * (1024 / height))))
    img2_1 = cv2.resize(
        img2, (int(width * (2048 / width)), int(height * (1024 / height))))
    print('img2_1:{}'.format(img2_1.shape))
    print('Heatmap resizing finished')

    # Blend the images
    blended = cv2.addWeighted(src1=img,
                              alpha=0.6,
                              src2=img2_1,
                              beta=0.4,
                              gamma=0.3)

    # Save the blended image
    cv2.imwrite('./tmp/blended_test.jpg', blended)  # <- add the dir
    print('Blended image created')

    # Prepare to crop the legend
    im = Image.open(hm_name_sample)

    # Crop the legend and resize
    # im_crop = im.crop((2230, 120, 2350, 1030))
    im_crop = im.crop((3150, 170, 3250, 1450))  # 2020/04/16 fixed a misaligned legend crop
    im_crop_rsize = im_crop.resize((100, 1024))
    plt.imshow(im_crop_rsize)
    im_crop_rsize.save('./tmp/colorbar_crop.jpg', quality=100)  # <- add the dir
    print('Legend created')

    # Create a blank image
    height = 1024
    width = 2200
    blank = np.zeros((height, width, 3))
    blank += 255  # add 255 to the all-zero data to make it white
    cv2.imwrite('./tmp/blank.jpg', blank)  # <- add the dir
    print('Base image created')

    # Compose the heatmap with the legend
    img0 = cv2.imread('./tmp/blank.jpg')          # blank image
    img1 = cv2.imread('./tmp/blended_test.jpg')   # blended heatmap image
    img2 = cv2.imread('./tmp/colorbar_crop.jpg')  # legend
    img0 = cv2.cvtColor(img0, cv2.COLOR_BGR2RGB)
    img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
    img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)
    base_img = img0

    # Anchor point for pasting the heatmap image
    x_offset = 0
    y_offset = 0
    # Paste the heatmap image
    base_img[y_offset:y_offset + img1.shape[0],
             x_offset:x_offset + img1.shape[1]] = img1

    # Anchor point for pasting the legend image
    # x_offset = 2050
    x_offset = 2100
    y_offset = 0
    # Paste the legend image
    base_img[y_offset:y_offset + img2.shape[0],
             x_offset:x_offset + img2.shape[1]] = img2

    base_img = cv2.cvtColor(base_img, cv2.COLOR_BGR2RGB)

    media_dir = './medium_items/media/' + media_id
    if not os.path.exists(media_dir):
        os.makedirs(media_dir)  # create the dir if it does not exist
    dir_file_name = './medium_items/media/' + media_id + '/heatmap.jpg'
    cv2.imwrite(dir_file_name, base_img)
    print('Blended image with legend created')

    # ----------------------------------------- Upload to GCS ------------------------------
    # Create a storage client.
    # os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/tani_kyuichiro/nurve-cloud-98b3f-bb7c97f8fb03.json'
    storage_client = google.cloud.storage.Client()
    bucket_name = 'rent-heatmap'
    bucket = storage_client.get_bucket(bucket_name)
    source_file_name = 'medium_items/media/' + media_id + '/heatmap.jpg'
    blob = bucket.blob(source_file_name)

    # Upload the local file to Cloud Storage.
    blob.upload_from_filename(source_file_name)
    print('File {} uploaded to {}.'.format(source_file_name, bucket))
    print('Upload to GCS finished')

    # ---------------------------------------------- Directory cleanup ---------------------
    shutil.rmtree('./medium_items/media/' + media_id)
    print('Directory removed')

    return True
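# download_blob is not defined in this snippet. A sketch following the
# standard google-cloud-storage client API (the author's actual helper may
# differ):
def download_blob(bucket_name, source_blob_name, destination_file_name):
    import google.cloud.storage

    storage_client = google.cloud.storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print('Blob {} downloaded to {}.'.format(source_blob_name,
                                             destination_file_name))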