def run(self, with_aws=True):
    """Fit a Prophet model on a Treasure Data table and store the forecast.

    Reads the `ds` and `y` columns of ``self.source_table`` for the range
    ``self.start``..``self.end`` through Presto, fits a Prophet model,
    predicts over a frame extended by ``self.period`` periods and writes
    the forecast to ``self.dbname``.``self.target_table`` (replacing it).

    Args:
        with_aws: When true, the fitted model and the forecast are passed
            to ``self._upload_graph`` before the forecast is stored.
    """
    import pandas_td as td
    from fbprophet import Prophet

    connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    presto = td.create_engine('presto:{}'.format(self.dbname), con=connection)

    # Note: Prophet requires `ds` column as date string and `y` column as target value
    query = """ select ds, y from {} where ds between '{}' and '{}' """.format(
        self.source_table, self.start, self.end)
    history = td.read_td(query, presto)

    prophet = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
    prophet.fit(history)

    horizon = prophet.make_future_dataframe(periods=self.period)
    forecast = prophet.predict(horizon)

    if with_aws:
        self._upload_graph(prophet, forecast)

    # To avoid TypeError: can't serialize Timestamp, convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    destination = "{}.{}".format(self.dbname, self.target_table)
    td.to_td(forecast, destination, connection, if_exists='replace')
def run(self):
    """Forecast with Prophet, publish both plots to S3 and store the forecast.

    Reads `ds`/`y` from ``self.source_table`` between ``self.start`` and
    ``self.end``, fits a Prophet model, predicts over a frame extended by
    ``self.period`` periods, uploads the forecast plot and the components
    plot as PNG objects to the bucket named by the ``S3_BUCKET`` environment
    variable, and writes the forecast to
    ``self.dbname``.``self.target_table`` (replacing it).
    """
    import boto3
    import matplotlib as mlp
    mlp.use('agg')
    from matplotlib import pyplot as plt
    import pandas_td as td
    from fbprophet import Prophet

    def _as_png(figure):
        # Render the figure into an in-memory PNG rewound to its first byte.
        buffer = io.BytesIO()
        figure.savefig(buffer, format='png')
        buffer.seek(0)
        return buffer

    connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    presto = td.create_engine('presto:{}'.format(self.dbname), con=connection)

    # Note: Prophet requires `ds` column as date string and `y` column as target value
    query = """ select ds, y from {} where ds between '{}' and '{}' """.format(
        self.source_table, self.start, self.end)
    history = td.read_td(query, presto)

    prophet = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
    prophet.fit(history)
    horizon = prophet.make_future_dataframe(periods=self.period)
    forecast = prophet.predict(horizon)

    forecast_figure = prophet.plot(forecast)
    components_figure = prophet.plot_components(forecast)
    forecast_png = _as_png(forecast_figure)
    components_png = _as_png(components_figure)

    # Upload figures to S3
    # boto3 assuming environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY":
    # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
    s3 = boto3.resource('s3')
    uploads = (
        ("predicted.png", forecast_png),
        ("component.png", components_png),
    )
    for object_key, payload in uploads:
        # ACL should be chosen with your purpose
        s3.Object(os.environ['S3_BUCKET'], object_key).put(
            ACL='public-read', Body=payload, ContentType='image/png')

    # To avoid TypeError: can't serialize Timestamp, convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    destination = "{}.{}".format(self.dbname, self.target_table)
    td.to_td(forecast, destination, connection, if_exists='replace')
def run_dynamic_query(parameters):
    """Run a query composed from *parameters* on Treasure Data and save the result.

    Args:
        parameters: Mapping with the keys ``db_name``, ``query_engine``,
            ``limit`` (``'0'`` or ``0`` means no LIMIT clause),
            ``col_list``, ``table_name`` and ``format``. The optional keys
            ``min_time`` and ``max_time`` are set to the string ``'NULL'``
            in place when absent.

    Returns:
        None. The result is written to ``<job_id>.csv`` when
        ``parameters['format'] == 'csv'``; otherwise it is appended to
        ``<job_id>.txt`` as a ``tabulate`` grid. Returns early, writing
        nothing, when reading the job result raises ``RuntimeError``.

    Raises:
        KeyError: If ``MASTER_TD_API_KEY`` is not set in the environment or
            a required key is missing from *parameters*.
    """
    #0. Initialize our connection to Treasure Data
    apikey = os.environ['MASTER_TD_API_KEY']
    endpoint = 'https://api.treasuredata.com'
    con = td.connect(apikey, endpoint)

    #1. Connect to the query engine
    con_engine = con.query_engine(database=parameters['db_name'],
                                  type=parameters['query_engine'])

    #2. Setup query limit string
    # Compare the stringified limit so that an integer 0 is treated like '0'.
    if str(parameters['limit']) != '0':
        limit_str = "LIMIT " + str(parameters['limit']) + ";"
    else:
        limit_str = ";"

    #3. Compose Query String
    # NOTE(review): the SQL text is assembled from caller-supplied strings
    # (column list, table name, time bounds); only pass trusted values.
    min_time = parameters.setdefault('min_time', 'NULL')
    max_time = parameters.setdefault('max_time', 'NULL')
    compose_query = "SELECT {} FROM {} ".format(parameters['col_list'],
                                                parameters['table_name'])
    if not (min_time == 'NULL' and max_time == 'NULL'):
        compose_query += "WHERE td_time_range(time,{},{}) ".format(min_time, max_time)
    compose_query += limit_str

    print("Executing..." + compose_query)

    #4. Run query as a job and wait for job to finish
    with tdclient.Client(apikey) as client:
        job = client.query(parameters['db_name'], compose_query,
                           type=parameters['query_engine'])
        job.wait()
        try:
            #Assign result set to a data frame
            df = td.read_td_job(job.job_id, con_engine)
        except RuntimeError:
            print("Please review the column names and delimited by commas: " + parameters['col_list'])
            return

    #5. Write the results to a csv or tabular format file
    if parameters['format'] == 'csv':
        print("Downloading results to " + job.job_id + ".csv" + " file")
        df.to_csv(job.job_id + ".csv")
    else:
        #Write data into tabular grid format
        print("Downloading results to " + job.job_id + ".txt" + " file")
        filename = job.job_id + ".txt"
        # The context manager closes the file even if formatting or writing fails.
        with open(filename, "a") as outfile:
            outfile.write(tabulate(df, tablefmt="grid"))
def run_dynamic_query(parameters):
    """Compose a SELECT from *parameters*, run it as a Treasure Data job and save the rows.

    Args:
        parameters: Mapping that provides ``db_name``, ``query_engine``,
            ``limit`` (``'0'`` or ``0`` disables the LIMIT clause),
            ``col_list``, ``table_name`` and ``format``. ``min_time`` and
            ``max_time`` are optional; when absent they are stored in the
            mapping as the string ``'NULL'``.

    Returns:
        None. Rows go to ``<job_id>.csv`` when ``parameters['format']`` is
        ``'csv'``, otherwise they are appended to ``<job_id>.txt`` as a
        ``tabulate`` grid. Nothing is written when fetching the job result
        raises ``RuntimeError``.

    Raises:
        KeyError: If ``MASTER_TD_API_KEY`` is not in the environment or a
            required key is missing from *parameters*.
    """
    #0. Initialize our connection to Treasure Data
    apikey = os.environ['MASTER_TD_API_KEY']
    endpoint = 'https://api.treasuredata.com'
    con = td.connect(apikey, endpoint)

    #1. Connect to the query engine
    con_engine = con.query_engine(database=parameters['db_name'],
                                  type=parameters['query_engine'])

    #2. Setup query limit string
    # Stringify first so that an integer 0 is recognised as "no limit" too.
    limit = str(parameters['limit'])
    limit_str = ";" if limit == '0' else "LIMIT " + limit + ";"

    #3. Compose Query String
    # NOTE(review): every fragment below comes straight from the caller and
    # is spliced into the SQL text; only pass trusted values.
    min_time = parameters.setdefault('min_time', 'NULL')
    max_time = parameters.setdefault('max_time', 'NULL')
    clauses = ["SELECT", parameters['col_list'], "FROM", parameters['table_name']]
    if (min_time, max_time) != ('NULL', 'NULL'):
        clauses.append("WHERE td_time_range(time," + min_time + "," + max_time + ")")
    clauses.append(limit_str)
    compose_query = " ".join(clauses)

    print("Executing..." + compose_query)

    #4. Run query as a job and wait for job to finish
    with tdclient.Client(apikey) as client:
        job = client.query(parameters['db_name'], compose_query,
                           type=parameters['query_engine'])
        job.wait()
        try:
            #Assign result set to a data frame
            df = td.read_td_job(job.job_id, con_engine)
        except RuntimeError:
            print("Please review the column names and delimited by commas: " + parameters['col_list'])
            return

    #5. Write the results to a csv or tabular format file
    if parameters['format'] == 'csv':
        print("Downloading results to " + job.job_id + ".csv" + " file")
        df.to_csv(job.job_id + ".csv")
        return

    #Write data into tabular grid format
    print("Downloading results to " + job.job_id + ".txt" + " file")
    filename = job.job_id + ".txt"
    # `with` guarantees the handle is closed even when the write raises.
    with open(filename, "a") as outfile:
        outfile.write(tabulate(df, tablefmt="grid"))
def jspca(self):
    """Embed the rows of `pca_input` in two dimensions and store the result.

    Installs the required packages with pip, reads ``dbname`` from
    ``config/params.yml`` and the API key from the ``python_apikey``
    environment variable, loads `label`/`lambda` from `pca_input` through
    Presto, builds a pairwise distance matrix with the Jensen-Shannon
    function ``_js``, fits a 2-component PCA on that matrix and writes the
    `x`/`y` coordinates to ``<dbname>.principal_component`` (replacing it).
    """
    # The PyPI name `sklearn` is a deprecated alias that no longer installs;
    # the distribution that provides the `sklearn` module is `scikit-learn`.
    for package in ('pandas', 'scipy', 'scikit-learn', 'pandas-td', 'pyyaml'):
        os.system('pip install ' + package)

    from sklearn.decomposition import PCA
    import pandas as pd
    import pandas_td
    import yaml
    from scipy.spatial.distance import pdist, squareform
    from scipy.stats import entropy

    def _js(_P, _Q):
        # Mean of the two relative entropies against the midpoint distribution.
        _M = 0.5 * (_P + _Q)
        return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

    with open('config/params.yml') as f:
        # safe_load: `yaml.load` without a Loader can construct arbitrary
        # objects and is rejected by current PyYAML releases.
        params = yaml.safe_load(f)

    apikey = os.environ.get("python_apikey")
    dbname = params['dbname']

    connection = pandas_td.connect(apikey=apikey)
    engine = pandas_td.create_engine('presto:{}'.format(dbname), con=connection)
    df = pandas_td.read_td(
        'select label, lambda from pca_input order by label asc', engine)

    # The first two entries of each `lambda` value are skipped and missing
    # entries count as 0.
    # NOTE(review): presumably `lambda` is an array column - confirm.
    dist = [
        [0 if v is None else v for v in values[2:]]
        for values in df['lambda']
    ]
    dist_matrix = squareform(pdist(dist, metric=_js))

    pca = PCA(n_components=2, random_state=0)
    result_df = pd.DataFrame(pca.fit_transform(dist_matrix), columns=['x', 'y'])
    pandas_td.to_td(result_df, '{}.principal_component'.format(dbname),
                    connection, if_exists='replace')
#!/usr/bin/python
# Reads 100 rows of `nasdaq` from the `sample_datasets` database through
# Presto and re-imports them into `workflow_temp.test_emr`.
import os
import sys
import pandas as pd
import pandas_td as td

# Single-argument print(...) is valid on both Python 2 and Python 3.
print("load.py started")

con = td.connect(apikey="TD_APIKEY", endpoint='https://api.treasuredata.com')

# Type: Presto, Database: sample_datasets
engine = td.create_engine('presto:sample_datasets', con=con)

# Read Treasure Data query from into a DataFrame.
df = td.read_td_query(''' SELECT time, close FROM nasdaq LIMIT 100 ''',
                      engine,
                      index_col='time',
                      parse_dates={'time': 's'})

# `head` has to be called; printing the attribute shows the bound method
# instead of the first rows.
print(df.head())

# Output DataFrame to TreasureData via Streaming Import. (If your dataset is
# large, this method is not recommended.)
td.to_td(df, 'workflow_temp.test_emr', con, if_exists='replace', index=False)

print("load.py finished")
def connect(self):
    """Return the connection produced by ``td.connect()`` called without arguments."""
    connection = td.connect()
    return connection
def setUp(self):
    """Create the test connection and a 2x2 frame with columns `x` and `y`."""
    rows = [[1, 2], [3, 4]]
    self.connection = connect("test-key", "test-endpoint")
    self.frame = pd.DataFrame(rows, columns=["x", "y"])
def setUp(self):
    """Build a connection and a Presto engine whose query and HTTP calls are mocked."""
    mock_job = MockJob()

    connection = connect("test-key", "test-endpoint")
    # Every client query hands back the same mock job.
    connection.client.query = MagicMock(return_value=mock_job)
    self.connection = connection

    engine = connection.query_engine("test_db", type="presto")
    # HTTP GETs issued by the engine return a mock request wrapping that job.
    engine._http_get = MagicMock(return_value=MockRequest(mock_job))
    self.engine = engine
def setUp(self):
    """Prepare ``self.connection`` and ``self.engine`` with mocked query and HTTP access."""
    stub_job = MockJob()
    query_stub = MagicMock(return_value=stub_job)

    self.connection = connect('test-key', 'test-endpoint')
    self.connection.client.query = query_stub

    self.engine = self.connection.query_engine('test_db', type='presto')
    # The engine's HTTP GET returns a mock request built around the stub job.
    self.engine._http_get = MagicMock(return_value=MockRequest(stub_job))
import pandas_td as td
import pandas as pd
import tdclient as tdc
import os

# Module-level connection shared by the functions below.
apikey = os.environ['MASTER_TD_API_KEY']
con = td.connect(apikey, endpoint='https://api.treasuredata.com/')


#####################################
def gettablename(tabName='tab_1'):
    '''gettablename function validates and returns name of the table

    Prints "wrong value" when the name is not 'tab_1'; the name is returned
    unchanged in either case.

    Input: name of the table
    Output: name of the table'''
    #logging.info("Validate table name entered by user")
    if tabName != 'tab_1':
        print("wrong value")
    return tabName


#####################################
def getdefaultcolumnlist(tabName, dbName, qEngine):
    # Opens a query engine of type `qEngine` on database `dbName`.
    # NOTE(review): the engine is neither used nor returned in the visible
    # code, and `tabName` is unused - the function looks unfinished; confirm.
    engine = con.query_engine(database=dbName, type=qEngine)
def run(with_aws=True):
    """Train a sentiment classifier on Treasure Data reviews and store its predictions.

    Installs the required packages with pip, loads the shuffled train and
    test review tables from the `sentiment` database, trains a
    ``DNNClassifier`` on an NNLM text embedding of `sentence`, prints the
    accuracy on both sets and writes `rowid`/`predicted_polarity` for the
    test set to `sentiment.test_predicted_polarities` (replacing it).

    Args:
        with_aws: When true, the embedding column and the trained estimator
            are passed to ``_upload_model`` after training.
    """
    # Original code is published at official document of TensorFlow under Apache License Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub
    import sys

    for requirement in ("pandas-td", "tensorflow==1.13.1 tensorflow_hub==0.1.1"):
        os.system(f"{sys.executable} -m pip install {requirement}")

    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_query = """ select rowid, sentence, sentiment, polarity from movie_review_train_shuffled """
    test_query = """ select rowid, sentence, sentiment, polarity from movie_review_test_shuffled """
    train_df = td.read_td(train_query, presto)
    test_df = td.read_td(test_query, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        pandas_input_fn = tf.estimator.inputs.pandas_input_fn

        train_input_fn = pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))
        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)
        predict_test_input_fn = pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data
    test_df['predicted_polarity'] = results
    predictions = test_df[['rowid', 'predicted_polarity']]
    td.to_td(predictions, 'sentiment.test_predicted_polarities',
             con=con, if_exists='replace', index=False)
# Set API key and start a session Set your API key to the environment variable TD_API_KEY and run "jupyter notebook": $ export TD_API_SERVER="https://api.treasuredata.com/" $ export TD_API_KEY="1234/abcd..." $ jupyter notebook con = td.connect(apikey=os.environ['TD_API_KEY'], endpoint=os.environ['TD_API_SERVER'], retry_post_requests=True) engine = td.create_engine('presto:sample_datasets', con=con) pip install pandas-td import pandas_td as td engine = td.create_engine('presto:sample_datasets') # Alternatively, initialize a connection explicitly con = td.connect(apikey=os.environ['TD_API_KEY'], endpoint=os.environ['TD_API_SERVER']) engine = td.create_engine('presto:sample_datasets', con=con) # con = td.connect() with td.connect() as con: td.to_td(df, 'my_db.test_table', con, if_exists='replace', index=False) # Import it into 'tutorial.import1' con = td.connect() td.to_td(df, 'tutorial.import1', con, if_exists='replace', index=False)
def setUp(self):
    """Create a connection backed by the mock client and a 2x2 `x`/`y` frame."""
    connection = connect("test-key", "test-endpoint")
    # Swap the real client for the mock supplied by the test case.
    connection.client = self.mock_client()
    self.connection = connection

    rows = [[1, 2], [3, 4]]
    self.frame = pd.DataFrame(rows, columns=["x", "y"])
def __init__(self, apikey, endpoint, database='sample_datasets'):
    """Open a Treasure Data connection and a Presto engine on *database*.

    Args:
        apikey: API key passed to ``td.connect``.
        endpoint: API endpoint passed to ``td.connect``.
        database: Database the Presto engine is bound to.
    """
    connection = td.connect(apikey=apikey, endpoint=endpoint)
    engine_url = 'presto:{}'.format(database)

    self.conn = connection
    self.database = database
    self.engine = td.create_engine(engine_url, connection)
def run(self):
    """Rank features with extra-trees and publish the selected-feature query.

    Samples ``self.source_table`` through Presto, fits an
    ``ExtraTreesRegressor`` predicting `medv` from the remaining columns,
    writes the per-column importances to `boston.feature_importances`
    (replacing it), keeps the columns chosen by ``SelectFromModel`` and
    builds a query for them with ``self._feature_column_query``. When the
    `digdag` module is importable, the query is stored in the digdag
    environment under `feature_query`.
    """
    import pandas as pd
    import pandas_td as td
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    dbname = self.dbname
    source_table = self.source_table
    engine = td.create_engine('presto:{}'.format(dbname), con=connection)

    # Fetch 25% random sampled data
    df = td.read_td(
        """ select * from {} tablesample bernoulli(25) """.format(source_table),
        engine)
    # You can use Hive instead:
    #
    # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
    # df = td.read_td(
    #     """
    #     select *
    #     from {}_train
    #     where rnd < 0.25
    #     """.format(source_table),
    #     engine_hive
    # )

    # Drop bookkeeping columns when present; the rest are features plus target.
    df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')
    y = df.medv
    X = df.drop(columns=['medv'])

    reg = ExtraTreesRegressor()
    reg = reg.fit(X, y)

    feature_importances = pd.DataFrame({
        'column': X.columns,
        'importance': reg.feature_importances_
    })
    # NOTE(review): the destination database is hard-coded to `boston`
    # while reads use self.dbname - confirm this is intended.
    td.to_td(feature_importances, 'boston.feature_importances',
             con=connection, if_exists='replace', index=False)

    model = SelectFromModel(reg, prefit=True)
    feature_idx = model.get_support()
    # X already holds every column except `medv`, so index it directly.
    selected_features = set(X.columns[feature_idx])

    categorical_columns = {'rad', 'chas'}
    quantitative_columns = set(X.columns) - categorical_columns
    feature_types = {
        'categorical_columns': categorical_columns,
        'quantitative_columns': quantitative_columns
    }
    feature_query = self._feature_column_query(selected_features,
                                               feature_types=feature_types)

    # Store query if possible
    try:
        import digdag
    except ImportError:
        # Not running under digdag: there is nowhere to store the query.
        pass
    else:
        digdag.env.store({'feature_query': feature_query})
def setUp(self):
    """Set up ``self.connection`` with the mock client and ``self.frame`` with 2x2 data."""
    self.connection = connect('test-key', 'test-endpoint')
    # Replace the client so no real API calls are issued.
    mock = self.mock_client()
    self.connection.client = mock

    column_names = ['x', 'y']
    self.frame = pd.DataFrame([[1, 2], [3, 4]], columns=column_names)
def run():
    """Train a sentiment classifier, upload the exported model to S3 and store predictions.

    Loads the shuffled train and test review tables from the `sentiment`
    database, trains a ``DNNClassifier`` on an NNLM text embedding of
    `sentence`, exports the model under ``EXPORT_DIR_BASE``, archives it as
    `tfmodel.tar.gz` and uploads the archive to the bucket named by the
    ``S3_BUCKET`` environment variable. It then prints the accuracy on both
    sets and writes `rowid`/`predicted_polarity` for the test set to
    `sentiment.test_predicted_polarities` (replacing it).
    """
    # Original code is published at official document of TensorFlow under Apache License Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    #os.system("pip install pandas-td tensorflow_hub boto3")
    import boto3
    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_query = """ select rowid, sentence, sentiment, polarity from movie_review_train_shuffled """
    test_query = """ select rowid, sentence, sentiment, polarity from movie_review_test_shuffled """
    train_df = td.read_td(train_query, presto)
    test_df = td.read_td(test_query, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    archive_name = 'tfmodel.tar.gz'

    with tf.Session(graph=tf.Graph()) as sess:
        pandas_input_fn = tf.estimator.inputs.pandas_input_fn

        train_input_fn = pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))
        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model on S3
        feature_spec = tf.feature_column.make_parse_example_spec(
            [embedded_text_feature_column])
        serving_input_receiver_fn = (
            tf.estimator.export.build_parsing_serving_input_receiver_fn(
                feature_spec))
        estimator.export_saved_model(EXPORT_DIR_BASE, serving_input_receiver_fn)

        # The archive is closed before it is uploaded.
        with tarfile.open(archive_name, 'w:gz') as tar:
            tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

        # Upload the TensorFlow model to S3
        # boto3 assuming environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY":
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')
        # ACL should be chosen with your purpose
        bucket = s3.Bucket(os.environ['S3_BUCKET'])
        bucket.upload_file(archive_name, archive_name)

        predict_train_input_fn = pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)
        predict_test_input_fn = pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data
    test_df['predicted_polarity'] = results
    predictions = test_df[['rowid', 'predicted_polarity']]
    td.to_td(predictions, 'sentiment.test_predicted_polarities',
             con=con, if_exists='replace', index=False)