def test_append_if_exists(self):
    client = self.connection.client
    # first call
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
    # second call
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
    client.create_log_table.assert_called_once_with('test_db', 'test_table')
def run(self):
    import boto3
    import matplotlib as mlp
    mlp.use('agg')
    from matplotlib import pyplot as plt
    import pandas_td as td
    from fbprophet import Prophet

    con = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

    # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
    df = td.read_td(
        """
        select ds, y
        from {}
        where ds between '{}' and '{}'
        """.format(self.source_table, self.start, self.end),
        engine)

    model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
    model.fit(df)
    future = model.make_future_dataframe(periods=self.period)
    forecast = model.predict(future)

    fig1 = model.plot(forecast)
    fig2 = model.plot_components(forecast)

    predict_fig_data = io.BytesIO()
    component_fig_data = io.BytesIO()
    fig1.savefig(predict_fig_data, format='png')
    fig2.savefig(component_fig_data, format='png')
    predict_fig_data.seek(0)
    component_fig_data.seek(0)

    # Upload figures to S3
    # boto3 assumes the environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY":
    # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
    s3 = boto3.resource('s3')
    predicted_fig_file = "predicted.png"
    component_fig_file = "component.png"

    # The ACL should be chosen according to your purpose
    s3.Object(os.environ['S3_BUCKET'], predicted_fig_file).put(
        ACL='public-read', Body=predict_fig_data, ContentType='image/png')
    s3.Object(os.environ['S3_BUCKET'], component_fig_file).put(
        ACL='public-read', Body=component_fig_data, ContentType='image/png')

    # To avoid "TypeError: can't serialize Timestamp", convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    td.to_td(forecast, "{}.{}".format(self.dbname, self.target_table), con, if_exists='replace')
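As the comment above notes, Prophet expects the frame read from Treasure Data to have a `ds` column holding date strings and a `y` column holding the numeric target. A minimal sketch of that input shape, with made-up values:

import pandas as pd

# Hypothetical illustration of the expected input: `ds` as date strings, `y` as the target to forecast
df = pd.DataFrame({
    "ds": ["2019-01-01", "2019-01-02", "2019-01-03"],
    "y": [120.0, 132.5, 118.7],
})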
def run(self, with_aws=True):
    import pandas_td as td
    from fbprophet import Prophet

    con = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

    # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
    df = td.read_td(
        """
        select ds, y
        from {}
        where ds between '{}' and '{}'
        """.format(self.source_table, self.start, self.end),
        engine)

    model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
    model.fit(df)
    future = model.make_future_dataframe(periods=self.period)
    forecast = model.predict(future)

    if with_aws:
        self._upload_graph(model, forecast)

    # To avoid "TypeError: can't serialize Timestamp", convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    td.to_td(forecast, "{}.{}".format(self.dbname, self.target_table), con, if_exists='replace')
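This variant delegates the figure upload to a `_upload_graph` helper that is not shown in the snippet. A plausible sketch of that helper, assembled from the inline upload code in the earlier example; the method name, file names, and the `S3_BUCKET` environment variable follow that example and are assumptions here:

def _upload_graph(self, model, forecast):
    import io
    import os
    import boto3

    # Render the forecast and component plots to in-memory PNGs
    fig1 = model.plot(forecast)
    fig2 = model.plot_components(forecast)
    predict_fig_data = io.BytesIO()
    component_fig_data = io.BytesIO()
    fig1.savefig(predict_fig_data, format='png')
    fig2.savefig(component_fig_data, format='png')
    predict_fig_data.seek(0)
    component_fig_data.seek(0)

    # Upload both figures to S3; boto3 reads credentials from the standard AWS environment variables
    s3 = boto3.resource('s3')
    s3.Object(os.environ['S3_BUCKET'], "predicted.png").put(
        ACL='public-read', Body=predict_fig_data, ContentType='image/png')
    s3.Object(os.environ['S3_BUCKET'], "component.png").put(
        ACL='public-read', Body=component_fig_data, ContentType='image/png')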
def write_td_table(database_name, table_name):
    import os
    import random

    import pandas as pd
    import pandas_td as td
    import tdclient
    from tdclient.errors import AlreadyExistsError

    # TODO TD client, check for table's existence
    # Connect to Treasure Data; credentials come from environment variables,
    # as in the other examples
    con = td.connect(apikey=os.environ['TD_API_KEY'], endpoint=os.environ['TD_API_SERVER'])
    engine = td.create_engine(f"presto:{database_name}", con=con)

    df = pd.DataFrame({"c": [random.random() for _ in range(20)]})

    # Manipulating data in Treasure Data via Python.
    # Uses https://github.com/treasure-data/td-client-python
    tdc = tdclient.Client(apikey=os.environ['TD_API_KEY'],
                          endpoint=os.environ['TD_API_SERVER'])
    try:
        tdc.create_database(database_name)
    except AlreadyExistsError:
        pass

    try:
        tdc.create_log_table(database_name, table_name)
    except AlreadyExistsError:
        pass

    table_path = f"{database_name}.{table_name}"
    td.to_td(df, table_path, con, if_exists='replace', index=False)
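A brief usage sketch for the function above; the database and table names are placeholders, not names from the original code:

# Write 20 random values into a hypothetical sandbox_db.random_values table
write_td_table("sandbox_db", "random_values")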
def test_append_if_exists(self):
    client = self.connection.client
    # first call
    to_td(self.frame, "test_db.test_table", self.connection, if_exists="append")
    # second call
    to_td(self.frame, "test_db.test_table", self.connection, if_exists="append")
    client.create_log_table.assert_called_once_with("test_db", "test_table")
def test_datetime_is_not_supported(self):
    # mock
    client = self.connection.client
    client.table = MagicMock(side_effect=tdclient.api.NotFoundError('test_table'))
    client.create_log_table = MagicMock()
    client.import_data = MagicMock()
    # test
    frame = pd.DataFrame({'timestamp': [datetime.datetime(2000, 1, 1)]})
    to_td(frame, 'test_db.test_table', self.connection)
def test_replace_if_exists(self):
    client = self.connection.client
    # first call
    to_td(self.frame, "test_db.test_table", self.connection, if_exists="replace")
    client.create_log_table.assert_called_with("test_db", "test_table")
    # second call
    to_td(self.frame, "test_db.test_table", self.connection, if_exists="replace")
    client.delete_table.assert_called_with("test_db", "test_table")
    client.create_log_table.assert_called_with("test_db", "test_table")
def test_replace_if_exists(self):
    client = self.connection.client
    # first call
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='replace')
    client.create_log_table.assert_called_with('test_db', 'test_table')
    # second call
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='replace')
    client.delete_table.assert_called_with('test_db', 'test_table')
    client.create_log_table.assert_called_with('test_db', 'test_table')
def test_ok_if_not_exists(self):
    # mock
    client = self.connection.client
    client.table = MagicMock(side_effect=tdclient.api.NotFoundError('test_table'))
    client.create_log_table = MagicMock()
    client.import_data = MagicMock()
    # test
    to_td(self.frame, 'test_db.test_table', self.connection)
    client.table.assert_called_with('test_db', 'test_table')
    client.create_log_table.assert_called_with('test_db', 'test_table')
def test_append_if_exists(self):
    # mock
    client = self.connection.client
    client.table = MagicMock(side_effect=tdclient.api.NotFoundError('test_table'))
    client.create_log_table = MagicMock()
    client.import_data = MagicMock()
    # first call
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
    # second call
    client.table = MagicMock()
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
    client.create_log_table.assert_called_once_with('test_db', 'test_table')
def test_append_if_exists(self):
    client = self.connection.client
    # first call
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
    # second call
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='append')
    client.create_log_table.assert_called_once_with(
        'test_db', 'test_table')
def jspca(self):
    os.system('pip install pandas')
    os.system('pip install scipy')
    os.system('pip install sklearn')
    os.system('pip install pandas-td')
    os.system('pip install pyyaml')

    from sklearn.decomposition import PCA
    import pandas as pd
    import pandas_td
    import yaml
    from scipy.spatial.distance import pdist, squareform
    from scipy.stats import entropy

    def _js(_P, _Q):
        # Jensen-Shannon divergence between two distributions
        _M = 0.5 * (_P + _Q)
        return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

    with open('config/params.yml') as f:
        params = yaml.load(f)

    apikey = os.environ.get("python_apikey")
    dbname = params['dbname']

    connection = pandas_td.connect(apikey=apikey)
    engine = pandas_td.create_engine('presto:{}'.format(dbname), con=connection)
    df = pandas_td.read_td(
        'select label, lambda from pca_input order by label asc', engine)

    pca = PCA(n_components=2, random_state=0)
    dist = []
    for index, row in df.iterrows():
        dist.append([0 if v is None else v for v in row['lambda'][2:]])
    dist_matrix = squareform(pdist(dist, metric=_js))
    result_df = pd.DataFrame(pca.fit_transform(dist_matrix), columns=['x', 'y'])

    pandas_td.to_td(result_df, '{}.principal_component'.format(dbname),
                    connection, if_exists='replace')
def test_invalid_if_exists(self):
    with self.assertRaises(ValueError):
        to_td(
            self.frame, "test_db.test_table", self.connection, if_exists="invalid"
        )
def test_invalid_table_name(self):
    with self.assertRaises(ValueError):
        to_td(self.frame, "invalid", self.connection)
def test_datetime_is_not_supported(self):
    client = self.connection.client
    # test
    frame = pd.DataFrame({"timestamp": [datetime.datetime(2000, 1, 1)]})
    with self.assertRaises(TypeError):
        to_td(frame, "test_db.test_table", self.connection)
def import_frame(self, frame, table):
    td.to_td(frame, self.database + '.' + table, self.conn,
             if_exists='replace', index=False)
def run(self):
    import pandas as pd
    import pandas_td as td
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

    dbname = self.dbname
    source_table = self.source_table

    engine = td.create_engine('presto:{}'.format(dbname), con=connection)

    # Fetch 25% random sampled data
    df = td.read_td(
        """
        select *
        from {} tablesample bernoulli(25)
        """.format(source_table),
        engine)

    # You can use Hive instead:
    #
    # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
    # df = td.read_td(
    #     """
    #     select *
    #     from {}_train
    #     where rnd < 0.25
    #     """.format(source_table),
    #     engine_hive
    # )

    df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')

    y = df.medv
    X = df.drop(columns=['medv'])

    categorical_columns = set(['rad', 'chas'])
    quantitative_columns = set(X.columns) - categorical_columns

    reg = ExtraTreesRegressor()
    reg = reg.fit(X, y)

    feature_importances = pd.DataFrame({
        'column': X.columns,
        'importance': reg.feature_importances_
    })
    td.to_td(feature_importances, 'boston.feature_importances',
             con=connection, if_exists='replace', index=False)

    model = SelectFromModel(reg, prefit=True)
    feature_idx = model.get_support()
    feature_name = df.drop(columns=['medv']).columns[feature_idx]
    selected_features = set(feature_name)

    categorical_columns = set(['rad', 'chas'])
    quantitative_columns = set(X.columns) - categorical_columns

    feature_types = {
        'categorical_columns': categorical_columns,
        'quantitative_columns': quantitative_columns
    }
    feature_query = self._feature_column_query(selected_features,
                                               feature_types=feature_types)

    # Store query if possible
    try:
        import digdag
        digdag.env.store({'feature_query': feature_query})
    except ImportError:
        pass
def test_datetime_is_not_supported(self):
    client = self.connection.client
    # test
    frame = pd.DataFrame({'timestamp': [datetime.datetime(2000, 1, 1)]})
    to_td(frame, 'test_db.test_table', self.connection)
def test_datetime_is_not_supported(self):
    client = self.connection.client
    # test
    frame = pd.DataFrame({'timestamp': [datetime.datetime(2000, 1, 1)]})
    to_td(frame, 'test_db.test_table', self.connection)
def test_fail_if_exists(self):
    client = self.connection.client
    client.table = MagicMock()
    to_td(self.frame, 'test_db.test_table', self.connection)
# Set API key and start a session

Set your API key to the environment variable TD_API_KEY and run "jupyter notebook":

    $ export TD_API_SERVER="https://api.treasuredata.com/"
    $ export TD_API_KEY="1234/abcd..."
    $ jupyter notebook

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'],
                     retry_post_requests=True)
    engine = td.create_engine('presto:sample_datasets', con=con)

    pip install pandas-td

    import pandas_td as td
    engine = td.create_engine('presto:sample_datasets')

    # Alternatively, initialize a connection explicitly
    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    engine = td.create_engine('presto:sample_datasets', con=con)

    # con = td.connect()
    with td.connect() as con:
        td.to_td(df, 'my_db.test_table', con, if_exists='replace', index=False)

    # Import it into 'tutorial.import1'
    con = td.connect()
    td.to_td(df, 'tutorial.import1', con, if_exists='replace', index=False)
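Putting the pieces above together, a minimal round trip might look like the following sketch. The `tutorial.import1` table name comes from the snippet above, and the query against `sample_datasets.nasdaq` follows the pattern used in the load.py example later in this collection; both are illustrative only.

    import os
    import pandas_td as td

    # Connect using the environment variables exported above
    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    engine = td.create_engine('presto:sample_datasets', con=con)

    # Read a small sample from the public sample_datasets database
    df = td.read_td_query('SELECT time, close FROM nasdaq LIMIT 100', engine)

    # Write it back out to a table via streaming import
    td.to_td(df, 'tutorial.import1', con, if_exists='replace', index=False)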
def test_invalid_if_exists(self):
    to_td(self.frame, 'test_db.test_table', self.connection, if_exists='invalid')
def test_invalid_table_name(self):
    to_td(self.frame, 'invalid', self.connection)
def run(with_aws=True):
    # Original code is published in the official TensorFlow documentation under Apache License Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    import sys
    os.system(f"{sys.executable} -m pip install pandas-td")
    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )

    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select rowid, sentence, sentiment, polarity
        from movie_review_train_shuffled
        """, presto)

    test_df = td.read_td(
        """
        select rowid, sentence, sentiment, polarity
        from movie_review_test_shuffled
        """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)
        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data
    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con, if_exists='replace', index=False)
def test_fail_if_exists(self):
    client = self.connection.client
    client.table = MagicMock()
    with self.assertRaises(RuntimeError):
        to_td(self.frame, "test_db.test_table", self.connection)
def test_ok_if_not_exists(self):
    client = self.connection.client
    to_td(self.frame, "test_db.test_table", self.connection)
    client.table.assert_called_with("test_db", "test_table")
    client.create_log_table.assert_called_with("test_db", "test_table")
#!/usr/bin/python
import os
import sys

import pandas as pd
import pandas_td as td

print("load.py started")

con = td.connect(apikey="TD_APIKEY", endpoint='https://api.treasuredata.com')

# Type: Presto, Database: sample_datasets
engine = td.create_engine('presto:sample_datasets', con=con)

# Read a Treasure Data query into a DataFrame.
df = td.read_td_query('''
    SELECT time, close
    FROM nasdaq
    LIMIT 100
''', engine, index_col='time', parse_dates={'time': 's'})

print(df.head())

# Output the DataFrame to Treasure Data via Streaming Import.
# (If your dataset is large, this method is not recommended.)
td.to_td(df, 'workflow_temp.test_emr', con, if_exists='replace', index=False)

print("load.py finished")
def run():
    # Original code is published in the official TensorFlow documentation under Apache License Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub
    # os.system("pip install pandas-td tensorflow_hub boto3")

    import boto3
    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select rowid, sentence, sentiment, polarity
        from movie_review_train_shuffled
        """, presto)

    test_df = td.read_td(
        """
        select rowid, sentence, sentiment, polarity
        from movie_review_test_shuffled
        """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export the TF model to S3
        feature_spec = tf.feature_column.make_parse_example_spec(
            [embedded_text_feature_column])
        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        estimator.export_saved_model(EXPORT_DIR_BASE, serving_input_receiver_fn)

        with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
            tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

        # Upload the TensorFlow model to S3
        # boto3 assumes the environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY":
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')

        # The ACL should be chosen according to your purpose
        s3.Bucket(os.environ['S3_BUCKET']).upload_file('tfmodel.tar.gz', 'tfmodel.tar.gz')

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)
        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data
    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con, if_exists='replace', index=False)
def test_ok_if_not_exists(self):
    client = self.connection.client
    to_td(self.frame, 'test_db.test_table', self.connection)
    client.table.assert_called_with('test_db', 'test_table')
    client.create_log_table.assert_called_with('test_db', 'test_table')