Beispiel #1
0
def write_td_table(database_name, table_name):
    """Create a TD database/table if missing and upload 20 random rows.

    Parameters
    ----------
    database_name : str
        Target Treasure Data database name.
    table_name : str
        Target log-table name inside ``database_name``.

    NOTE(review): relies on module-level names not visible in this block —
    ``td`` (pytd.pandas_td), ``tdclient``, ``os``, and an open connection
    ``con`` — confirm they are defined at module scope.
    """
    import pandas as pd
    import random

    # A toy DataFrame: one column "c" with 20 random floats.
    df = pd.DataFrame({"c": [random.random() for _ in range(20)]})

    # Manipulating data in Treasure Data via Python.
    # Uses https://github.com/treasure-data/td-client-python
    tdc = tdclient.Client(apikey=os.environ['TD_API_KEY'],
                          endpoint=os.environ['TD_API_SERVER'])

    # Idempotent create: ignore "already exists" so reruns are safe.
    try:
        tdc.create_database(database_name)
    except tdclient.errors.AlreadyExistsError:
        pass

    try:
        tdc.create_log_table(database_name, table_name)
    except tdclient.errors.AlreadyExistsError:
        pass

    # (Removed an unused `engine = td.create_engine(...)` local — the upload
    # below goes through `con` directly and never used the engine.)
    table_path = f"{database_name}.{table_name}"
    td.to_td(df, table_path, con, if_exists='replace', index=False)
Beispiel #2
0
def upload(database, table, project_id, model_id, datasource_id):
    """Run DataRobot predictions on a TD data source and store them in TD.

    Reads DataRobot and Treasure Data credentials from environment
    variables, scores `model_id` against the uploaded data source, and
    replaces `database.table` with the prediction results.
    """
    # --- Configuration (read in the same order as before) ---
    dr_api_token = os.environ['DR_API_TOKEN']
    td_username = os.environ['TD_USERNAME']
    td_password = os.environ['TD_PASSWORD']
    td_api_key = os.environ['TD_API_KEY']
    td_api_server = os.environ['TD_API_SERVER']
    max_wait = 60 * 60  # Maximum number of seconds to wait for prediction job to finish

    # --- Predict with DataRobot ---
    dr.Client(endpoint='https://app.datarobot.com/api/v2', token=dr_api_token)
    project = dr.Project.get(project_id)
    model = dr.Model.get(project_id, model_id)
    dataset = project.upload_dataset_from_data_source(
        datasource_id, td_username, td_password)

    prediction_job = model.request_predictions(dataset.id)
    predictions = prediction_job.get_result_when_complete(max_wait=max_wait)

    # --- Upload results to Treasure Data ---
    from pytd import pandas_td as td
    connection = td.connect(apikey=td_api_key, endpoint=td_api_server)
    destination = '{}.{}'.format(database, table)
    td.to_td(predictions,
             destination,
             con=connection,
             if_exists='replace',
             index=False)
Beispiel #3
0
    def run(self, with_aws=True):
        """Fit Prophet on `ds`/`y` rows from TD and write the forecast back.

        Reads rows between `self.start` and `self.end` from
        `self.source_table`, optionally uploads forecast graphs to AWS,
        and replaces `self.dbname.self.target_table` with the forecast.
        """
        import pytd.pandas_td as td
        from fbprophet import Prophet

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)
        engine = td.create_engine('presto:{}'.format(self.dbname), con=connection)

        # Prophet requires a `ds` column (date string) and a `y` column (target).
        query = """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end)
        df = td.read_td(query, engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        forecast = model.predict(model.make_future_dataframe(periods=self.period))

        if with_aws:
            self._upload_graph(model, forecast)

        # Timestamps can't be serialized on upload; stringify `ds` first.
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results.
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con=connection,
                 if_exists='replace')
Beispiel #4
0
def main(database, date, n_split, n_features, timezone):
    """Train a random forest on TD training rows and score each split.

    Parameters
    ----------
    database : str
        TD database holding the `train` and `preprocessed` tables.
    date : str
        Date in 'YYYY-MM-DD'; becomes the `time` column of the output.
    n_split : int or str
        Number of `nth_group` partitions to score (coerced to int).
    n_features : int or str
        Sparse feature dimensionality (coerced to int).
    timezone : str
        Timezone name passed to `gettz`.

    NOTE(review): relies on module-level names defined elsewhere in this
    file: ``pytd``, ``apikey``, ``endpoint``, ``create_sparse_features``,
    ``RandomForestClassifier``, ``pd``, ``dt``, ``gettz``, ``gc``, ``td``,
    ``con`` — confirm at module scope.
    """
    n_split = int(n_split)
    n_features = int(n_features)
    client = pytd.Client(apikey=apikey, endpoint=endpoint)

    result = client.query(f'''
    select
      features, target
    from
      {database}.train
    where
      is_test = 0
  ''')

    target = [r[1] for r in result['data']]
    features = create_sparse_features(result, n_features)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(features, target)
    # NOTE(review): prints the first label plus the entire feature matrix —
    # looks like a debug leftover; confirm before removing.
    print(target[0], features)

    # Free the training data before scoring the (potentially large) splits.
    del target, features
    gc.collect()

    # The upload timestamp depends only on `date` and `timezone`; hoisted out
    # of the loop below (it was recomputed identically every iteration).
    # NOTE(review): `.astimezone(...)` interprets the naive datetime in the
    # system's local zone before converting; if `date` should be read *in*
    # `timezone`, `replace(tzinfo=gettz(timezone))` may be intended — confirm.
    time = int(
        dt.datetime.strptime(date, '%Y-%m-%d').astimezone(
            gettz(timezone)).timestamp())

    for i in range(n_split):
        result = client.query(f'''
      select
        features, cookie
      from
        {database}.preprocessed
      where
        nth_group = {i}
    ''')
        features = create_sparse_features(result, n_features)
        pred = clf.predict(features)
        cookies = [r[1] for r in result['data']]
        pred_df = pd.DataFrame({
            'pred': pred,
            'cookie': cookies
        }).assign(time=time)
        # Release per-split intermediates before the upload.
        del pred, cookies
        gc.collect()
        td.to_td(pred_df,
                 f'{database}.predicted',
                 con=con,
                 if_exists='append',
                 time_col='time',
                 index=False)
Beispiel #5
0
def get_records(api, basic, org, app_id, database, table, fields, query,
                id_field_code):
    """Page through a kintone app's records and append them to a TD table.

    Parameters
    ----------
    api : str
        String representation of the API list (evaluated — see note below).
    basic : str
        Value for the kintone `Authorization` header (Basic auth).
    org : str
        cybozu.com subdomain of the organization.
    app_id : value compared against each entry's ``"id"``
        Selects which configured app's token to use.
    database, table : str
        Destination TD database/table (records are appended).
    fields : list
        Field codes to request from kintone.
    query : str
        Base kintone query string.
    id_field_code : str
        Field used to order records for stable paging.
    """
    # Load the API list.
    # SECURITY: `eval` on an externally supplied string executes arbitrary
    # code — prefer json.loads / ast.literal_eval if `api` is untrusted.
    api_list = eval(api)
    # Create the connection to Treasure Data (td-spark writer).
    writer = SparkWriter(
        td_spark_path=TDSparkContextBuilder.default_jar_path())
    con = td.connect(writer=writer)
    # Find the entry matching the requested app id.
    for a in api_list:
        if a["id"] == app_id:
            # kintone REST API settings.
            url = f"https://{org}.cybozu.com/k/v1/records.json"
            headers = {"X-Cybozu-API-Token": a["key"], "Authorization": basic}
            # NOTE(review): "app" is hard-coded to 1 even though we matched on
            # `app_id` above — confirm whether this should be `app_id`.
            payload = {
                "app": 1,
                "query": query,
                "fields": fields,
                "totalCount": "true"
            }
            r = requests.get(url, headers=headers, params=payload)
            count = int(json.loads(r.text)["totalCount"])
            print(count)
            # Fetch records in pages of 100 (plain stepped range replaces the
            # original, equivalent `itertools.islice(range(0, count), 0, None, 100)`).
            for i in range(0, count, 100):
                splited_query = (query + " order by " + id_field_code +
                                 " asc limit 100 offset " + f"{i}")
                print(splited_query)
                payload = {"app": 1, "query": splited_query, "fields": fields}
                r = requests.get(url, headers=headers, params=payload)
                if r.status_code != 200:
                    sys.exit(1)
                else:
                    data = json.loads(r.text)
                    df = pd.DataFrame.from_dict(data)
                    df = json_normalize(df["records"])
                    df = df.rename(columns=column_encode)
                # Append this page of records to the TD table.
                td.to_td(
                    df,
                    ".".join([database, table]),
                    con,
                    if_exists="append",
                    index=False,
                )
Beispiel #6
0
    def run(
        self,
        database="timeseries",
        source_table="retail_sales",
        target_table="predicted_sales",
        start_date="1993-01-01",
        end_date="2016-05-31",
        period=365,
        with_aws=True,
    ):
        """Forecast retail sales with Prophet and store the result in TD.

        Fits a Prophet model on `ds`/`y` rows between `start_date` and
        `end_date`, optionally uploads forecast graphs to AWS, and replaces
        `database.target_table` with the forecast.
        """
        import pytd.pandas_td as td
        from fbprophet import Prophet

        period = int(period)

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)
        engine = td.create_engine(f"presto:{database}", con=connection)

        # Prophet requires a `ds` column (date string) and a `y` column (target).
        sql = f"""
            select ds, y
            from {source_table}
            where ds between '{start_date}' and '{end_date}'
            """
        df = td.read_td(sql, engine)

        model = Prophet(seasonality_mode="multiplicative", mcmc_samples=300)
        model.fit(df)
        forecast = model.predict(model.make_future_dataframe(periods=period))

        if with_aws:
            self._upload_graph(model, forecast)

        # Timestamps can't be serialized on upload; stringify `ds` first.
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results.
        td.to_td(forecast,
                 "{}.{}".format(database, target_table),
                 con=connection,
                 if_exists="replace")
def run_batch(
    database, input_table, output_table, device, model, vocab, setup, batchsize=64
):
    """Score sentences from a TD table with a Chainer sentiment model.

    Reads `input_table` via Presto, predicts polarity in batches of
    `batchsize`, and replaces `database.output_table` with per-row
    predicted polarities.
    """

    def predict_batch(words_batch):
        """Return (predicted labels, confidence scores) for one batch."""
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        # Inference mode: training config off, no backprop graph.
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)

        # Note: Prediction labels are different from original Chainer example
        #       positive: 1, negative: 0
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()

        return answers, scores

    td_api_key = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]

    logger.info("Connect to Treasure Data")

    con = td.connect()
    presto = td.create_engine(f"presto:{database}", con=con)

    logger.info("Fetch data from Treasure Data")
    test_df = td.read_td(
        f"""
        select
            rowid, sentence, sentiment, polarity
        from
            {input_table}
    """,
        presto,
    )

    sentences = test_df["sentence"].tolist()

    logger.info("Start prediction")
    pending = []        # tokenized sentences awaiting prediction
    label_chunks = []   # one array of labels per processed batch
    batch_no = 1
    for sentence in sentences:
        normalized = nlp_utils.normalize_text(sentence)
        pending.append(
            nlp_utils.split_text(normalized, char_based=setup["char_based"]))
        if len(pending) >= batchsize:
            labels, _ = predict_batch(pending)
            label_chunks.append(labels)
            pending = []
            logger.info(f"Predicted: {batch_no}th batch. batch size {batchsize}")
            batch_no += 1

    # Flush the final partial batch, if any.
    if pending:
        labels, _ = predict_batch(pending)
        label_chunks.append(labels)

    logger.info("Finish prediction")

    test_df["predicted_polarity"] = numpy.concatenate(label_chunks, axis=None)

    # Note: Train test split strategy is different from pre trained model and
    #       these tables so that the model includes test data since the model
    #       is trained by Chainer official example.
    #       This accuracy is just for a demo.
    #
    # accuracy = (test_df.polarity == test_df.predicted_polarity).value_counts()[
    #     1
    # ] / len(test_df)
    # print(f"Test set accuracy: {accuracy}")

    con2 = td.connect(apikey=td_api_key, endpoint=endpoint)

    td.to_td(
        test_df[["rowid", "predicted_polarity"]],
        f"{database}.{output_table}",
        con=con2,
        if_exists="replace",
        index=False,
    )

    logger.info("Upload completed")
    def run(self):
        """Select predictive features for the Boston housing data.

        Fits an ExtraTreesRegressor on a ~25% sample of
        `self.dbname.self.source_table`, uploads per-column importances to
        `boston.feature_importances`, then builds a feature-selection query
        via `self._feature_column_query` and stores it in the digdag
        environment when running under digdag.
        """
        import pandas as pd
        import pytd.pandas_td as td
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        dbname = self.dbname
        source_table = self.source_table

        engine = td.create_engine('presto:{}'.format(dbname), con=connection)

        # Fetch 25% random sampled data
        df = td.read_td(
            """
            select *
            from {} tablesample bernoulli(25)
            """.format(source_table), engine)
        # You can use Hive instead:
        #
        # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
        # df = td.read_td(
        #     """
        #     select *
        #     from {}_train
        #     where rnd < 0.25
        #     """.format(source_table),
        #     engine_hive
        # )
        # Drop bookkeeping columns that must not be used as features.
        df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')

        y = df.medv
        X = df.drop(columns=['medv'])

        reg = ExtraTreesRegressor()
        reg = reg.fit(X, y)

        # Persist per-column importances for inspection.
        feature_importances = pd.DataFrame({
            'column':
            X.columns,
            'importance':
            reg.feature_importances_
        })
        td.to_td(feature_importances,
                 'boston.feature_importances',
                 con=connection,
                 if_exists='replace',
                 index=False)

        # Keep only features whose importance clears the selector's threshold.
        model = SelectFromModel(reg, prefit=True)

        feature_idx = model.get_support()
        # X already equals df without 'medv'; no need to drop again.
        feature_name = X.columns[feature_idx]
        selected_features = set(feature_name)

        # Computed once, right before use (the original computed these
        # identical sets twice, with the first pair unused).
        categorical_columns = set(['rad', 'chas'])
        quantitative_columns = set(X.columns) - categorical_columns

        feature_types = {
            'categorical_columns': categorical_columns,
            'quantitative_columns': quantitative_columns
        }
        feature_query = self._feature_column_query(selected_features,
                                                   feature_types=feature_types)

        # Store query if possible
        try:
            import digdag
            digdag.env.store({'feature_query': feature_query})

        except ImportError:
            pass
Beispiel #9
0
def run(with_aws=True):
    """Train a TF-Hub text classifier on TD movie reviews and store predictions.

    Installs pinned TF 1.x / pytd at runtime, reads train/test review tables
    from the `sentiment` database via Presto, trains a DNNClassifier on
    pre-trained sentence embeddings, prints train/test accuracy, and writes
    per-row predicted polarity back to `sentiment.test_predicted_polarities`.

    NOTE(review): relies on names defined elsewhere in this file —
    ``os``, ``_upload_model``, ``get_predictions`` — confirm at module scope.
    """
    # Original code is published at official document of TensorFlow under Apache License Version 2.0
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    import sys
    # Runtime pip install; versions are pinned to the TF 1.x API used below.
    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )
    os.system(f"{sys.executable} -m pip install -U pytd==0.6.1")

    # Imported after the install above so the pinned versions are picked up.
    import tensorflow as tf
    import tensorflow_hub as hub
    import pytd.pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        # num_epochs=None repeats the data indefinitely; training length is
        # bounded by `steps` in estimator.train below.
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        # Pre-trained sentence-embedding module from TF Hub.
        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        # Two-class (polarity) DNN classifier over the embeddings.
        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        # Evaluation inputs: single pass, no shuffling.
        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        # `get_predictions` is defined elsewhere in this file — presumably it
        # extracts class labels from estimator.predict; verify against caller.
        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
Beispiel #10
0
    def run(self, dbname="boston", source_table="house_prices"):
        """Select predictive features for the house-price data.

        Fits an ExtraTreesRegressor on a ~25% sample of
        `dbname.source_table`, uploads per-column importances to
        `{dbname}.feature_importances`, then builds a feature-selection
        query via `self._feature_column_query` and stores it in the digdag
        environment when running under digdag.
        """
        import pandas as pd
        import pytd.pandas_td as td
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)
        engine = td.create_engine(f"presto:{dbname}", con=connection)

        # Fetch 25% random sampled data
        df = td.read_td(
            f"""
            select *
            from {source_table} tablesample bernoulli(25)
            """,
            engine,
        )
        # You can use Hive instead:
        #
        # engine_hive = td.create_engine(f'hive:{dbname}', con=connection)
        # df = td.read_td(
        #     """
        #     select *
        #     from {}_train
        #     where rnd < 0.25
        #     """.format(source_table),
        #     engine_hive
        # )
        # Drop bookkeeping columns that must not be used as features.
        df = df.drop(columns=["time", "v", "rnd", "rowid"], errors="ignore")

        y = df.medv
        X = df.drop(columns=["medv"])

        reg = ExtraTreesRegressor()
        reg = reg.fit(X, y)

        # Persist per-column importances for inspection.
        feature_importances = pd.DataFrame({
            "column":
            X.columns,
            "importance":
            reg.feature_importances_
        })
        td.to_td(
            feature_importances,
            f"{dbname}.feature_importances",
            con=connection,
            if_exists="replace",
            index=False,
        )

        # Keep only features whose importance clears the selector's threshold.
        model = SelectFromModel(reg, prefit=True)

        feature_idx = model.get_support()
        # X already equals df without "medv"; no need to drop again.
        feature_name = X.columns[feature_idx]
        selected_features = set(feature_name)

        # Computed once, right before use (the original computed these
        # identical sets twice, with the first pair unused).
        categorical_columns = set(["rad", "chas"])
        quantitative_columns = set(X.columns) - categorical_columns

        feature_types = {
            "categorical_columns": categorical_columns,
            "quantitative_columns": quantitative_columns,
        }
        feature_query = self._feature_column_query(selected_features,
                                                   feature_types=feature_types)

        # Store query if possible
        try:
            import digdag

            digdag.env.store({"feature_query": feature_query})

        except ImportError:
            pass