Example #1
    def run(self, with_aws=True):
        import pandas_td as td
        from fbprophet import Prophet

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

        # Note: Prophet requires `ds` column as date string and `y` column as target value
        df = td.read_td(
            """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end), engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=self.period)
        forecast = model.predict(future)

        if with_aws:
            self._upload_graph(model, forecast)

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con,
                 if_exists='replace')
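Example #1 delegates the figure upload to a `self._upload_graph(model, forecast)` helper that is not shown above. A minimal sketch of such a helper, modeled on the inline S3 upload in Example #2 (the `S3_BUCKET` environment variable and the headless `agg` matplotlib backend are assumptions carried over from that example), might look like this:

    def _upload_graph(self, model, forecast):
        # Hypothetical helper mirroring Example #2: render the Prophet figures
        # in memory and upload them to S3 with boto3.
        import io
        import os

        import boto3
        import matplotlib as mlp
        mlp.use('agg')

        fig1 = model.plot(forecast)
        fig2 = model.plot_components(forecast)

        predict_fig_data = io.BytesIO()
        component_fig_data = io.BytesIO()
        fig1.savefig(predict_fig_data, format='png')
        fig2.savefig(component_fig_data, format='png')
        predict_fig_data.seek(0)
        component_fig_data.seek(0)

        # boto3 reads AWS credentials from the AWS_ACCESS_KEY_ID /
        # AWS_SECRET_ACCESS_KEY environment variables
        s3 = boto3.resource('s3')
        s3.Object(os.environ['S3_BUCKET'], 'predicted.png').put(
            ACL='public-read', Body=predict_fig_data, ContentType='image/png')
        s3.Object(os.environ['S3_BUCKET'], 'component.png').put(
            ACL='public-read', Body=component_fig_data, ContentType='image/png')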
Example #2
    def run(self):
        import io
        import os

        import boto3
        import matplotlib as mlp
        mlp.use('agg')
        from matplotlib import pyplot as plt
        import pandas_td as td
        from fbprophet import Prophet

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

        # Note: Prophet requires `ds` column as date string and `y` column as target value
        df = td.read_td(
            """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end), engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=self.period)
        forecast = model.predict(future)

        fig1 = model.plot(forecast)
        fig2 = model.plot_components(forecast)
        predict_fig_data = io.BytesIO()
        component_fig_data = io.BytesIO()
        fig1.savefig(predict_fig_data, format='png')
        fig2.savefig(component_fig_data, format='png')
        predict_fig_data.seek(0)
        component_fig_data.seek(0)

        # Upload figures to S3
        # boto3 assuming environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY":
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')

        predicted_fig_file = "predicted.png"
        component_fig_file = "component.png"

        # ACL should be chosen according to your purpose
        s3.Object(os.environ['S3_BUCKET'],
                  predicted_fig_file).put(ACL='public-read',
                                          Body=predict_fig_data,
                                          ContentType='image/png')
        s3.Object(os.environ['S3_BUCKET'],
                  component_fig_file).put(ACL='public-read',
                                          Body=component_fig_data,
                                          ContentType='image/png')

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con,
                 if_exists='replace')
Example #3
def run_dynamic_query(parameters):
    #0. Initialize our connection to Treasure Data
    apikey = os.environ['MASTER_TD_API_KEY']
    endpoint = 'https://api.treasuredata.com'
    con = td.connect(apikey, endpoint)
    #1. Connect to the query engine
    con_engine = con.query_engine(database=parameters['db_name'],
                                  type=parameters['query_engine'])

    #2. Setup query limit string
    if parameters['limit'] != '0':
        limit_str = "LIMIT " + str(parameters['limit']) + ";"
    else:
        limit_str = ";"

    #3. Compose Query String
    if not 'min_time' in parameters.keys():
        parameters['min_time'] = 'NULL'

    if not 'max_time' in parameters.keys():
        parameters['max_time'] = 'NULL'

    if parameters['min_time'] == 'NULL' and parameters['max_time'] == 'NULL':
        compose_query = "SELECT " + parameters['col_list']   + " " + \
                        "FROM "   + parameters['table_name'] + " " + limit_str

    else:
        compose_query = "SELECT " + parameters['col_list']   + " " + \
                        "FROM "   + parameters['table_name'] + " " + \
                        "WHERE "  + "td_time_range(time,"    + parameters['min_time'] + "," + parameters['max_time'] + ") " + \
                        limit_str

    print("Executing..." + compose_query)
    #4. Run query as a job and wait for job to finish
    #Assign result set to a data frame

    with tdclient.Client(apikey) as client:
        job = client.query(parameters['db_name'],
                           compose_query,
                           type=parameters['query_engine'])
        job.wait()
        try:
            #Assign result set to a data frame
            df = td.read_td_job(job.job_id, con_engine)
        except RuntimeError:
            print("Please review the column names and delimited by commas: " +
                  parameters['col_list'])
            return

    #5. Write the results to a csv or tabular format file
    if parameters['format'] == 'csv':
        print("Downloading results to " + job.job_id + ".csv" + " file")
        df.to_csv(job.job_id + ".csv")
    else:
        #Write data into tabular grid format
        print("Downloading results to " + job.job_id + ".txt" + " file")
        filename = job.job_id + ".txt"
        outfile = open(filename, "a")
        outfile.write(tabulate(df, tablefmt="grid"))
        outfile.close()
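The shape of the `parameters` dict that `run_dynamic_query` expects is only implied by the lookups above. A hedged example call (database, table, and column names are purely illustrative) could look like:

params = {
    'db_name': 'sample_datasets',       # database to query
    'query_engine': 'presto',           # or 'hive'
    'table_name': 'nasdaq',             # table to read from
    'col_list': 'symbol, open, close',  # comma-delimited list of columns
    'limit': '100',                     # '0' means no LIMIT clause
    'min_time': '1356998400',           # optional unix timestamps for td_time_range;
    'max_time': '1388534400',           # omit (or 'NULL') to skip the WHERE clause
    'format': 'csv',                    # 'csv' writes <job_id>.csv, anything else a tabular .txt
}
run_dynamic_query(params)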
Example #4
def run_dynamic_query(parameters):
    #0. Initialize our connection to Treasure Data
    apikey=os.environ['MASTER_TD_API_KEY']
    endpoint='https://api.treasuredata.com'
    con = td.connect(apikey, endpoint)
    #1. Connect to the query engine
    con_engine=con.query_engine(database=parameters['db_name'], type=parameters['query_engine'])

    #2. Setup query limit string
    if parameters['limit'] != '0':
        limit_str = "LIMIT " + str(parameters['limit']) + ";"
    else:
        limit_str = ";"

    #3. Compose Query String
    if not 'min_time' in parameters.keys():
        parameters['min_time'] = 'NULL'

    if not 'max_time' in parameters.keys():
        parameters['max_time'] = 'NULL'

    if parameters['min_time'] == 'NULL' and parameters['max_time'] == 'NULL':
        compose_query = "SELECT " + parameters['col_list']   + " " + \
                        "FROM "   + parameters['table_name'] + " " + limit_str

    else:
        compose_query = "SELECT " + parameters['col_list']   + " " + \
                        "FROM "   + parameters['table_name'] + " " + \
                        "WHERE "  + "td_time_range(time,"    + parameters['min_time'] + "," + parameters['max_time'] + ") " + \
                        limit_str

    print("Executing..." + compose_query)
    #4. Run query as a job and wait for job to finish
    #Assign result set to a data frame

    with tdclient.Client(apikey) as client:
        job = client.query(parameters['db_name'],compose_query,type=parameters['query_engine'])
        job.wait()
        try:
            #Assign result set to a data frame
            df = td.read_td_job(job.job_id, con_engine)
        except RuntimeError:
            print("Please review the column names and delimited by commas: " + parameters['col_list'])
            return

    #5. Write the results to a csv or tabular format file
    if parameters['format'] == 'csv':
        print("Downloading results to " + job.job_id + ".csv" + " file")
        df.to_csv(job.job_id + ".csv")
    else:
        #Write data into tabular grid format
        print("Downloading results to " + job.job_id + ".txt" + " file")
        filename = job.job_id + ".txt"
        outfile = open(filename,"a")
        outfile.write(tabulate(df, tablefmt="grid"))
        outfile.close()
Example #5
    def jspca(self):
        os.system('pip install pandas')
        os.system('pip install scipy')
        os.system('pip install scikit-learn')
        os.system('pip install pandas-td')
        os.system('pip install pyyaml')

        from sklearn.decomposition import PCA
        import pandas as pd
        import pandas_td
        import yaml
        from scipy.spatial.distance import pdist, squareform
        from scipy.stats import entropy

        def _js(_P, _Q):
            _M = 0.5 * (_P + _Q)
            return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

        with open('config/params.yml') as f:
            params = yaml.safe_load(f)

        apikey = os.environ.get("python_apikey")
        dbname = params['dbname']

        connection = pandas_td.connect(apikey=apikey)

        engine = pandas_td.create_engine('presto:{}'.format(dbname),
                                         con=connection)

        df = pandas_td.read_td(
            'select label, lambda from pca_input order by label asc', engine)

        pca = PCA(n_components=2, random_state=0)

        dist = []
        for index, row in df.iterrows():
            dist.append([0 if v is None else v for v in row['lambda'][2:]])

        dist_matrix = squareform(pdist(dist, metric=_js))

        result_df = pd.DataFrame(pca.fit_transform(dist_matrix),
                                 columns=['x', 'y'])

        pandas_td.to_td(result_df,
                        '{}.principal_component'.format(dbname),
                        connection,
                        if_exists='replace')
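The `_js` helper above is the Jensen-Shannon divergence, JS(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M) with M = 0.5 * (P + Q); SciPy's `entropy` computes the KL divergence when given two arguments. A quick standalone check with made-up distributions:

import numpy as np
from scipy.stats import entropy

def _js(_P, _Q):
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

p = np.array([0.1, 0.4, 0.5])
q = np.array([0.3, 0.3, 0.4])
print(_js(p, q))  # small positive value
print(_js(p, p))  # 0.0 for identical distributions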
Example #6
#!/usr/bin/python

import os
import sys
import pandas as pd
import pandas_td as td

print "load.py started"

con = td.connect(apikey="TD_APIKEY", endpoint='https://api.treasuredata.com')

# Type: Presto, Database: sample_datasets
engine = td.create_engine('presto:sample_datasets', con=con)

# Read the result of a Treasure Data query into a DataFrame.
df = td.read_td_query('''
SELECT time, close FROM nasdaq LIMIT 100
''',
                      engine,
                      index_col='time',
                      parse_dates={'time': 's'})

print(df.head())

# Output the DataFrame to Treasure Data via streaming import. (Not recommended for large datasets.)
td.to_td(df, 'workflow_temp.test_emr', con, if_exists='replace', index=False)

print("load.py finished")
Example #7
 def connect(self):
     return td.connect()
Example #8
 def setUp(self):
     self.connection = connect('test-key', 'test-endpoint')
     self.frame = pd.DataFrame([[1,2],[3,4]], columns=['x', 'y'])
Example #9
 def setUp(self):
     job = MockJob()
     self.connection = connect('test-key', 'test-endpoint')
     self.connection.client.query = MagicMock(return_value=job)
     self.engine = self.connection.query_engine('test_db', type='presto')
     self.engine._http_get = MagicMock(return_value=MockRequest(job))
Example #10
 def setUp(self):
     job = MockJob()
     self.connection = connect('test-key', 'test-endpoint')
     self.connection.client.query = MagicMock(return_value=job)
     self.engine = self.connection.query_engine('test_db', type='presto')
     self.engine._http_get = MagicMock(return_value=MockRequest(job))
Example #11
 def setUp(self):
     job = MockJob()
     self.connection = connect("test-key", "test-endpoint")
     self.connection.client.query = MagicMock(return_value=job)
     self.engine = self.connection.query_engine("test_db", type="presto")
     self.engine._http_get = MagicMock(return_value=MockRequest(job))
Example #12
import pandas_td as td
import pandas as pd
import tdclient as tdc
import os

apikey = os.environ['MASTER_TD_API_KEY']
con = td.connect(apikey, endpoint='https://api.treasuredata.com/')
#####################################


def gettablename(tabName='tab_1'):
    '''gettablename validates and returns the name of the table
    Input: name of the table
    Output: name of the table'''
    #logging.info("Validate table name entered by user")
    tabFlag = False
    while not tabFlag:
        if tabName != 'tab_1':
            print("wrong value")
            break
        else:
            tabFlag = True
            break
    return tabName


#####################################


def getdefaultcolumnlist(tabName, dbName, qEngine):
    engine = con.query_engine(database=dbName, type=qEngine)
Example #13
def run(with_aws=True):
    # Original code is published at official document of TensorFlow under Apache License Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    import os
    import sys
    os.system(f"{sys.executable} -m pip install pandas-td")
    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )

    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
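Example #13 exports the trained model through an `_upload_model(embedded_text_feature_column, estimator)` helper that is not shown. Example #20 below performs the same export inline; a sketch of the helper based on that code (the `EXPORT_DIR_BASE` constant and the `S3_BUCKET` environment variable are carried over from Example #20) might be:

def _upload_model(embedded_text_feature_column, estimator):
    # Hypothetical helper mirroring the inline export/upload in Example #20.
    import os
    import tarfile

    import boto3
    import tensorflow as tf

    feature_spec = tf.feature_column.make_parse_example_spec(
        [embedded_text_feature_column])
    serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
        feature_spec)
    estimator.export_saved_model(EXPORT_DIR_BASE, serving_input_receiver_fn)

    with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
        tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

    # boto3 reads AWS credentials from environment variables
    s3 = boto3.resource('s3')
    s3.Bucket(os.environ['S3_BUCKET']).upload_file('tfmodel.tar.gz', 'tfmodel.tar.gz')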
Example #14
# Set API key and start a session
Install pandas-td, set your API key in the environment variable TD_API_KEY, and run "jupyter notebook":

$ pip install pandas-td
$ export TD_API_SERVER="https://api.treasuredata.com/"
$ export TD_API_KEY="1234/abcd..."
$ jupyter notebook

import os
import pandas_td as td

# Create a query engine; without an explicit connection, pandas-td reads TD_API_KEY from the environment
engine = td.create_engine('presto:sample_datasets')

# Alternatively, initialize a connection explicitly
con = td.connect(apikey=os.environ['TD_API_KEY'],
                 endpoint=os.environ['TD_API_SERVER'],
                 retry_post_requests=True)
engine = td.create_engine('presto:sample_datasets', con=con)

# Write a DataFrame to Treasure Data
# con = td.connect()
with td.connect() as con:
    td.to_td(df, 'my_db.test_table', con, if_exists='replace', index=False)

# Import it into 'tutorial.import1'
con = td.connect()
td.to_td(df, 'tutorial.import1', con, if_exists='replace', index=False)
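With the session started, a minimal hedged read against the `sample_datasets.nasdaq` table used in Example #6 (column names assumed from that example) looks like:

df = td.read_td_query('''
SELECT time, close FROM nasdaq LIMIT 10
''', engine, index_col='time', parse_dates={'time': 's'})
print(df.head())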
Example #15
 def setUp(self):
     self.connection = connect('test-key', 'test-endpoint')
     self.connection.client = self.mock_client()
     self.frame = pd.DataFrame([[1, 2], [3, 4]], columns=['x', 'y'])
Example #16
 def __init__(self, apikey, endpoint, database='sample_datasets'):
     self.conn = td.connect(apikey=apikey, endpoint=endpoint)
     self.database = database
     self.engine = td.create_engine('presto:{}'.format(database), self.conn)
Example #17
    def run(self):
        import pandas as pd
        import pandas_td as td
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        dbname = self.dbname
        source_table = self.source_table

        engine = td.create_engine('presto:{}'.format(dbname), con=connection)

        # Fetch 25% random sampled data
        df = td.read_td(
            """
            select *
            from {} tablesample bernoulli(25)
            """.format(source_table), engine)
        # You can use Hive instead:
        #
        # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
        # df = td.read_td(
        #     """
        #     select *
        #     from {}_train
        #     where rnd < 0.25
        #     """.format(source_table),
        #     engine_hive
        # )
        df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')

        y = df.medv
        X = df.drop(columns=['medv'])

        categorical_columns = set(['rad', 'chas'])
        quantitative_columns = set(X.columns) - categorical_columns

        reg = ExtraTreesRegressor()
        reg = reg.fit(X, y)

        feature_importances = pd.DataFrame({
            'column':
            X.columns,
            'importance':
            reg.feature_importances_
        })
        td.to_td(feature_importances,
                 'boston.feature_importances',
                 con=connection,
                 if_exists='replace',
                 index=False)

        model = SelectFromModel(reg, prefit=True)

        feature_idx = model.get_support()
        feature_name = df.drop(columns=['medv']).columns[feature_idx]
        selected_features = set(feature_name)

        categorical_columns = set(['rad', 'chas'])
        quantitative_columns = set(X.columns) - categorical_columns

        feature_types = {
            'categorical_columns': categorical_columns,
            'quantitative_columns': quantitative_columns
        }
        feature_query = self._feature_column_query(selected_features,
                                                   feature_types=feature_types)

        # Store query if possible
        try:
            import digdag
            digdag.env.store({'feature_query': feature_query})

        except ImportError:
            pass
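The `_feature_column_query` helper is not included in the snippet, so what it emits is specific to the original project. Purely as a hypothetical illustration of the idea (turning the selected features into a select-list string while distinguishing categorical from quantitative columns), it could be something like:

    def _feature_column_query(self, selected_features, feature_types):
        # Hypothetical sketch: keep quantitative columns as-is and cast
        # categorical columns to varchar, returning a comma-separated list.
        columns = []
        for col in sorted(selected_features):
            if col in feature_types['categorical_columns']:
                columns.append('cast({0} as varchar) as {0}'.format(col))
            else:
                columns.append(col)
        return ', '.join(columns)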
Example #18
 def connect(self):
     return td.connect()
Example #19
 def setUp(self):
     self.connection = connect("test-key", "test-endpoint")
     self.connection.client = self.mock_client()
     self.frame = pd.DataFrame([[1, 2], [3, 4]], columns=["x", "y"])
Example #20
def run():
    # Original code is published at official document of TensorFlow under Apache License Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    #os.system("pip install pandas-td tensorflow_hub boto3")

    import boto3
    import tensorflow as tf
    import tensorflow_hub as hub
    import pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled    
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model on S3
        feature_spec = tf.feature_column.make_parse_example_spec(
            [embedded_text_feature_column])
        serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
            feature_spec)
        estimator.export_saved_model(EXPORT_DIR_BASE,
                                     serving_input_receiver_fn)

        with tarfile.open('tfmodel.tar.gz', 'w:gz') as tar:
            tar.add(EXPORT_DIR_BASE, arcname=os.path.basename(EXPORT_DIR_BASE))

        # Upload the TensorFlow model to S3
        # boto3 assuming environment variables "AWS_ACCESS_KEY_ID" and "AWS_SECRET_ACCESS_KEY":
        # http://boto3.readthedocs.io/en/latest/guide/configuration.html#environment-variables
        s3 = boto3.resource('s3')
        # ACL should be chosen according to your purpose
        s3.Bucket(os.environ['S3_BUCKET']).upload_file('tfmodel.tar.gz',
                                                       'tfmodel.tar.gz')

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
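Both TensorFlow examples call a `get_predictions(estimator, input_fn)` helper that is not defined in the snippets. A minimal sketch, assuming it simply collects the predicted class id from `estimator.predict`:

def get_predictions(estimator, input_fn):
    # DNNClassifier yields a dict per example; "class_ids" holds the
    # predicted class, so take its single element for each row.
    return [int(p["class_ids"][0]) for p in estimator.predict(input_fn=input_fn)]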