def upload(database, table, project_id, model_id, datasource_id):
    # Setup
    DR_API_TOKEN = os.environ['DR_API_TOKEN']
    TD_USERNAME = os.environ['TD_USERNAME']
    TD_PASSWORD = os.environ['TD_PASSWORD']
    TD_API_KEY = os.environ['TD_API_KEY']
    TD_API_SERVER = os.environ['TD_API_SERVER']

    MAX_WAIT = 60 * 60  # Maximum number of seconds to wait for the prediction job to finish

    dr.Client(endpoint='https://app.datarobot.com/api/v2', token=DR_API_TOKEN)

    project = dr.Project.get(project_id)
    model = dr.Model.get(project_id, model_id)
    dataset = project.upload_dataset_from_data_source(datasource_id, TD_USERNAME, TD_PASSWORD)

    # Predict
    pred_job = model.request_predictions(dataset.id)
    pred_df = pred_job.get_result_when_complete(max_wait=MAX_WAIT)

    # Upload
    from pytd import pandas_td as td

    con = td.connect(apikey=TD_API_KEY, endpoint=TD_API_SERVER)
    td.to_td(pred_df, '{}.{}'.format(database, table), con=con, if_exists='replace', index=False)
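# A minimal, hypothetical usage sketch for `upload` above (not part of the original
# snippet). It assumes `os` and the DataRobot client (`import datarobot as dr`) are
# imported at module level, that the DR_*/TD_* environment variables are already set,
# and that the placeholder names and IDs below are replaced with your own values.
if __name__ == "__main__":
    upload(
        database="my_database",            # hypothetical Treasure Data database
        table="predictions",               # hypothetical output table
        project_id="<dr-project-id>",      # placeholder DataRobot project id
        model_id="<dr-model-id>",          # placeholder DataRobot model id
        datasource_id="<dr-datasource-id>" # placeholder DataRobot data source id
    )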
def run(self, with_aws=True):
    import pytd.pandas_td as td
    from fbprophet import Prophet

    con = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

    # Note: Prophet requires `ds` column as date string and `y` column as target value
    df = td.read_td(
        """
        select ds, y
        from {}
        where ds between '{}' and '{}'
        """.format(self.source_table, self.start, self.end),
        engine)

    model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
    model.fit(df)
    future = model.make_future_dataframe(periods=self.period)
    forecast = model.predict(future)

    if with_aws:
        self._upload_graph(model, forecast)

    # To avoid TypeError: can't serialize Timestamp, convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    td.to_td(forecast, "{}.{}".format(self.dbname, self.target_table), con, if_exists='replace')
def get_records(api, basic, org, app_id, database, table, fields, query, id_field_code):
    # Load the API list
    api_list = eval(api)

    # Create a connection to Treasure Data
    writer = SparkWriter(
        td_spark_path=TDSparkContextBuilder.default_jar_path())
    con = td.connect(writer=writer)

    # Loop over the registered apps
    for a in api_list:
        # Select the app matching app_id
        if a["id"] == app_id:
            # kintone API settings
            url = f"https://{org}.cybozu.com/k/v1/records.json"
            headers = {"X-Cybozu-API-Token": a["key"], "Authorization": basic}
            payload = {
                "app": 1,
                "query": query,
                "fields": fields,
                "totalCount": "true"
            }
            r = requests.get(url, headers=headers, params=payload)
            count = int(json.loads(r.text)["totalCount"])
            print(count)

            # Fetch the records in chunks of 100 and cache them in a DataFrame
            for i in itertools.islice(range(0, count), 0, None, 100):
                splited_query = (query + " order by " + id_field_code +
                                 " asc limit 100 offset " + f"{i}")
                print(splited_query)
                payload = {"app": 1, "query": splited_query, "fields": fields}
                r = requests.get(url, headers=headers, params=payload)
                if r.status_code != 200:
                    sys.exit(1)
                else:
                    data = json.loads(r.text)
                    df = pd.DataFrame.from_dict(data)
                    df = json_normalize(df["records"])
                    df = df.rename(columns=column_encode)
                    # Store the records fetched from kintone (app ID = X) into the TD table
                    td.to_td(
                        df,
                        ".".join([database, table]),
                        con,
                        if_exists="append",
                        index=False,
                    )
def run(
    self,
    database="timeseries",
    source_table="retail_sales",
    target_table="predicted_sales",
    start_date="1993-01-01",
    end_date="2016-05-31",
    period=365,
    with_aws=True,
):
    import pytd.pandas_td as td
    from fbprophet import Prophet

    period = int(period)

    con = td.connect(apikey=self.apikey, endpoint=self.endpoint)
    engine = td.create_engine(f"presto:{database}", con=con)

    # Note: Prophet requires `ds` column as date string and `y` column as target value
    df = td.read_td(
        f"""
        select ds, y
        from {source_table}
        where ds between '{start_date}' and '{end_date}'
        """,
        engine,
    )

    model = Prophet(seasonality_mode="multiplicative", mcmc_samples=300)
    model.fit(df)
    future = model.make_future_dataframe(periods=period)
    forecast = model.predict(future)

    if with_aws:
        self._upload_graph(model, forecast)

    # To avoid TypeError: can't serialize Timestamp, convert
    # `pandas._libs.tslibs.timestamps.Timestamp` to `str`
    forecast.ds = forecast.ds.apply(str)

    # Store prediction results
    td.to_td(
        forecast, "{}.{}".format(database, target_table), con, if_exists="replace"
    )
# Because we do not currently allow custom images, this is how you can add
# 3rd-party libraries instead
import os
import sys

import tdclient

os.system(f"{sys.executable} -m pip install -U pytd==0.6.1")

import pytd.pandas_td as td

apikey = os.environ['TD_API_KEY']
endpoint = os.environ['TD_API_SERVER']

con = td.connect(apikey=apikey, endpoint=endpoint)


def read_td_table(database_name, table_name, engine_name='presto', limit=1000):
    engine = td.create_engine(f"{engine_name}:{database_name}", con=con)
    df = td.read_td(f'SELECT * FROM {table_name} LIMIT {limit}', engine)
    print(df)


def write_td_table(database_name, table_name):
    import pandas as pd
    import random

    # TODO: use the TD client to check for the table's existence
    engine = td.create_engine(f"presto:{database_name}", con=con)
    df = pd.DataFrame({"c": [random.random() for _ in range(20)]})

    # Manipulating data in Treasure Data via Python.
    # Uses https://github.com/treasure-data/td-client-python
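# A hypothetical usage sketch for the helpers above (not part of the original
# snippet). It assumes TD_API_KEY / TD_API_SERVER are set, that the bundled
# sample_datasets.www_access table is available in your account, and that
# "my_database" is a database you own; substitute your own names as needed.
if __name__ == "__main__":
    read_td_table("sample_datasets", "www_access", engine_name="presto", limit=10)
    write_td_table("my_database", "my_random_table")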
def run_batch(
    database, input_table, output_table, device, model, vocab, setup, batchsize=64
):
    def predict_batch(words_batch):
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)
        # Note: Prediction labels differ from the original Chainer example:
        # positive: 1, negative: 0
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()
        return answers, scores

    td_api_key = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]

    logger.info("Connect to Treasure Data")
    con = td.connect()
    presto = td.create_engine(f"presto:{database}", con=con)

    logger.info("Fetch data from Treasure Data")
    test_df = td.read_td(
        f"""
        select
            rowid, sentence, sentiment, polarity
        from
            {input_table}
        """,
        presto,
    )

    sentences = test_df["sentence"].tolist()

    logger.info("Start prediction")
    batch = []
    predicted = []
    i = 1
    for sentence in sentences:
        text = nlp_utils.normalize_text(sentence)
        words = nlp_utils.split_text(text, char_based=setup["char_based"])
        batch.append(words)
        if len(batch) >= batchsize:
            _predicted, _ = predict_batch(batch)
            predicted.append(_predicted)
            batch = []
            logger.info(f"Predicted: {i}th batch. batch size {batchsize}")
            i += 1

    if batch:
        _predicted, _ = predict_batch(batch)
        predicted.append(_predicted)

    logger.info("Finish prediction")

    test_df["predicted_polarity"] = numpy.concatenate(predicted, axis=None)

    # Note: The train/test split strategy of the pre-trained model differs from
    # these tables, so the model has already seen the test data (it was trained
    # with the official Chainer example). This accuracy is just for a demo.
    #
    # accuracy = (test_df.polarity == test_df.predicted_polarity).value_counts()[
    #     1
    # ] / len(test_df)
    # print(f"Test set accuracy: {accuracy}")

    con2 = td.connect(apikey=td_api_key, endpoint=endpoint)
    td.to_td(
        test_df[["rowid", "predicted_polarity"]],
        f"{database}.{output_table}",
        con=con2,
        if_exists="replace",
        index=False,
    )
    logger.info("Upload completed")
def run(self):
    import pandas as pd
    import pytd.pandas_td as td
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

    dbname = self.dbname
    source_table = self.source_table

    engine = td.create_engine('presto:{}'.format(dbname), con=connection)

    # Fetch 25% random sampled data
    df = td.read_td(
        """
        select *
        from {} tablesample bernoulli(25)
        """.format(source_table),
        engine)

    # You can use Hive instead:
    #
    # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
    # df = td.read_td(
    #     """
    #     select *
    #     from {}_train
    #     where rnd < 0.25
    #     """.format(source_table),
    #     engine_hive
    # )

    df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')

    y = df.medv
    X = df.drop(columns=['medv'])

    categorical_columns = set(['rad', 'chas'])
    quantitative_columns = set(X.columns) - categorical_columns

    reg = ExtraTreesRegressor()
    reg = reg.fit(X, y)

    feature_importances = pd.DataFrame({
        'column': X.columns,
        'importance': reg.feature_importances_
    })
    td.to_td(feature_importances,
             'boston.feature_importances',
             con=connection,
             if_exists='replace',
             index=False)

    model = SelectFromModel(reg, prefit=True)
    feature_idx = model.get_support()
    feature_name = df.drop(columns=['medv']).columns[feature_idx]
    selected_features = set(feature_name)

    categorical_columns = set(['rad', 'chas'])
    quantitative_columns = set(X.columns) - categorical_columns

    feature_types = {
        'categorical_columns': categorical_columns,
        'quantitative_columns': quantitative_columns
    }
    feature_query = self._feature_column_query(selected_features,
                                               feature_types=feature_types)

    # Store query if possible
    try:
        import digdag
        digdag.env.store({'feature_query': feature_query})
    except ImportError:
        pass
def run(with_aws=True):
    # Original code is published in the official TensorFlow documentation under the
    # Apache License, Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub
    import sys

    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )
    os.system(f"{sys.executable} -m pip install -U pytd==0.6.1")

    import tensorflow as tf
    import tensorflow_hub as hub
    import pytd.pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
        """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
        """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)
        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)

        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

        # Store prediction results to Treasure Data
        test_df['predicted_polarity'] = results

        td.to_td(test_df[['rowid', 'predicted_polarity']],
                 'sentiment.test_predicted_polarities',
                 con=con,
                 if_exists='replace',
                 index=False)
def run(self, dbname="boston", source_table="house_prices"):
    import pandas as pd
    import pytd.pandas_td as td
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

    engine = td.create_engine(f"presto:{dbname}", con=connection)

    # Fetch 25% random sampled data
    df = td.read_td(
        f"""
        select *
        from {source_table} tablesample bernoulli(25)
        """,
        engine,
    )

    # You can use Hive instead:
    #
    # engine_hive = td.create_engine(f'hive:{dbname}', con=connection)
    # df = td.read_td(
    #     """
    #     select *
    #     from {}_train
    #     where rnd < 0.25
    #     """.format(source_table),
    #     engine_hive
    # )

    df = df.drop(columns=["time", "v", "rnd", "rowid"], errors="ignore")

    y = df.medv
    X = df.drop(columns=["medv"])

    categorical_columns = set(["rad", "chas"])
    quantitative_columns = set(X.columns) - categorical_columns

    reg = ExtraTreesRegressor()
    reg = reg.fit(X, y)

    feature_importances = pd.DataFrame({
        "column": X.columns,
        "importance": reg.feature_importances_
    })
    td.to_td(
        feature_importances,
        f"{dbname}.feature_importances",
        con=connection,
        if_exists="replace",
        index=False,
    )

    model = SelectFromModel(reg, prefit=True)
    feature_idx = model.get_support()
    feature_name = df.drop(columns=["medv"]).columns[feature_idx]
    selected_features = set(feature_name)

    categorical_columns = set(["rad", "chas"])
    quantitative_columns = set(X.columns) - categorical_columns

    feature_types = {
        "categorical_columns": categorical_columns,
        "quantitative_columns": quantitative_columns,
    }
    feature_query = self._feature_column_query(selected_features,
                                               feature_types=feature_types)

    # Store query if possible
    try:
        import digdag

        digdag.env.store({"feature_query": feature_query})
    except ImportError:
        pass
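# `_feature_column_query` is defined elsewhere in the original workflow; the snippet
# above only shows it being called and its result stored via digdag.env.store. The
# function below is a minimal, hypothetical sketch of what such a helper could do,
# assuming it merely builds a comma-separated select list that casts categorical
# columns to varchar and quantitative ones to double. The real helper may differ.
def _feature_column_query_sketch(selected_features, feature_types):
    casts = []
    for col in sorted(selected_features):
        if col in feature_types["categorical_columns"]:
            casts.append(f"cast({col} as varchar) as {col}")
        else:
            casts.append(f"cast({col} as double) as {col}")
    # e.g. "cast(chas as varchar) as chas, cast(rm as double) as rm, ..."
    return ", ".join(casts)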