Example no. 1
    def run(self, with_aws=True):
        import pytd.pandas_td as td
        from fbprophet import Prophet

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine('presto:{}'.format(self.dbname), con=con)

        # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
        df = td.read_td(
            """
            select ds, y
            from {}
            where ds between '{}' and '{}'
            """.format(self.source_table, self.start, self.end), engine)

        model = Prophet(seasonality_mode='multiplicative', mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=self.period)
        forecast = model.predict(future)

        if with_aws:
            self._upload_graph(model, forecast)

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(forecast,
                 "{}.{}".format(self.dbname, self.target_table),
                 con,
                 if_exists='replace')
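For reference, the same read/write round trip can also be written against pytd's plain client API instead of the pandas_td wrapper used above. This is only a minimal sketch, assuming pytd's Client.query() / Client.load_table_from_dataframe() interface; the database and table names are placeholders.

# Hedged sketch: pandas_td-free variant of the read/write round trip above.
# Assumes pytd's Client API; database/table names are placeholders.
import os

import pandas as pd
import pytd

client = pytd.Client(apikey=os.environ["TD_API_KEY"],
                     endpoint=os.environ["TD_API_SERVER"],
                     database="timeseries")

# query() returns a dict with "columns" and "data" keys
res = client.query("select ds, y from retail_sales")
df = pd.DataFrame(res["data"], columns=res["columns"])

# "overwrite" plays the role of if_exists='replace' in td.to_td above
client.load_table_from_dataframe(df, "predicted_sales", if_exists="overwrite")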
Example no. 2
def write_td_table(database_name, table_name):
    import os
    import random

    import pandas as pd
    import pytd.pandas_td as td
    import tdclient

    # TODO: use the TD client to check for the table's existence explicitly
    # Connection used by the pandas_td write helper below
    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])

    df = pd.DataFrame({"c": [random.random() for _ in range(20)]})

    # Manipulating data in Treasure Data via Python.
    # Uses https://github.com/treasure-data/td-client-python
    tdc = tdclient.Client(apikey=os.environ['TD_API_KEY'],
                          endpoint=os.environ['TD_API_SERVER'])

    # Create the database and table if they do not already exist
    try:
        tdc.create_database(database_name)
    except tdclient.errors.AlreadyExistsError:
        pass

    try:
        tdc.create_log_table(database_name, table_name)
    except tdclient.errors.AlreadyExistsError:
        pass

    table_path = f"{database_name}.{table_name}"
    td.to_td(df, table_path, con, if_exists='replace', index=False)
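The TODO in Example no. 2 asks for an explicit existence check instead of creating the table and catching AlreadyExistsError. A possible sketch with td-client-python follows; `ensure_log_table` is an illustrative helper name, and the use of `Client.table()` raising `tdclient.errors.NotFoundError` for a missing table is an assumption to verify against the installed client version.

# Hedged sketch: look the table up first, create it only if the lookup fails.
# Assumption: tdclient.Client.table() raises tdclient.errors.NotFoundError
# when the table does not exist; `ensure_log_table` is an illustrative name.
import os

import tdclient


def ensure_log_table(database_name, table_name):
    with tdclient.Client(apikey=os.environ["TD_API_KEY"],
                         endpoint=os.environ["TD_API_SERVER"]) as tdc:
        try:
            tdc.table(database_name, table_name)
        except tdclient.errors.NotFoundError:
            tdc.create_log_table(database_name, table_name)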
Example no. 3
import json
import re
import sys

import pytd.pandas_td as td
import requests


def put_records(api, basic, org, app_id, c, content_type, database, query):
    # Load the API list (passed in as a string representation of a list of dicts)
    api_list = eval(api)
    for a in api_list:
        # Process the app whose id matches app_id
        if a["id"] == app_id:
            # Start the Presto query engine
            engine = td.create_engine(f"presto:{database}")
            c = eval(c)
            c = [d["name"] for d in c]
            # Fetch the data from Treasure Data
            df = td.read_td(query, engine)
            # `column_decode` is assumed to be a column-name mapping defined elsewhere
            df = df.rename(columns=column_decode)
            # Keep only columns whose names end in "value", then strip the ".value" suffix
            df = df[[
                s for s in list(df.columns) if re.match(r".*?(value)\Z", s)
            ]].rename(columns=lambda s: s.replace(".value", ""))
            df = df[c]
            l_records = df.to_dict(orient="records")
            # Send records in batches of 100 (kintone's bulk-update limit)
            bucket = 100
            record_batches = [
                l_records[idx:idx + bucket]
                for idx in range(0, len(l_records), bucket)
            ]
            id_column = c.pop(0)
            for batch in record_batches:
                payload = ""
                for s in r:
                    r_id = '{"id": ' + f"{s[id_column]}" + ', "record": {'
                    kv = "".join([
                        '"' + f"{k}" + '": {"value": "' + f"{s[k]}" +
                        '"},' if k in c else "" for k in s
                    ])
                    record = r_id + kv[:-1] + "}},"
                    payload = payload + record
                payload = '{"app":' + f"{app_id}" + ',"records":[' + payload[:
                                                                             -1] + "]}"
                data = json.loads(payload)
                url = f"https://{org}.cybozu.com/k/v1/records.json"
                headers = {
                    "X-Cybozu-API-Token": a["key"],
                    "Authorization": basic,
                    "Content-Type": content_type,
                }
                r = requests.put(url,
                                 data=payload.encode("utf-8"),
                                 headers=headers)
                if r.status_code != 200:
                    sys.exit(1)
        else:
            # Entries in the API list that belong to other apps are skipped
            continue
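Because the payload sent to kintone is ordinary JSON, serialization can also be left to requests. A minimal sketch of a single batch update against the same bulk-update endpoint as above; `put_batch` and its arguments are illustrative names, not part of the original module.

# Hedged sketch: send one batch of records with requests' json= parameter,
# letting requests serialize the body and set the Content-Type header.
# `put_batch` is an illustrative helper, not part of the original module.
import requests


def put_batch(org, app_id, api_token, basic, records):
    # records: [{"id": ..., "record": {"field": {"value": ...}, ...}}, ...]
    url = f"https://{org}.cybozu.com/k/v1/records.json"
    headers = {"X-Cybozu-API-Token": api_token, "Authorization": basic}
    r = requests.put(url, json={"app": app_id, "records": records}, headers=headers)
    r.raise_for_status()
    return r.json()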
Example no. 4
    def run(
        self,
        database="timeseries",
        source_table="retail_sales",
        target_table="predicted_sales",
        start_date="1993-01-01",
        end_date="2016-05-31",
        period=365,
        with_aws=True,
    ):
        import pytd.pandas_td as td
        from fbprophet import Prophet

        period = int(period)

        con = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        engine = td.create_engine(f"presto:{database}", con=con)

        # Note: Prophet requires a `ds` column (date string) and a `y` column (target value)
        df = td.read_td(
            f"""
            select ds, y
            from {source_table}
            where ds between '{start_date}' and '{end_date}'
            """,
            engine,
        )

        model = Prophet(seasonality_mode="multiplicative", mcmc_samples=300)
        model.fit(df)
        future = model.make_future_dataframe(periods=period)
        forecast = model.predict(future)

        if with_aws:
            self._upload_graph(model, forecast)

        # To avoid TypeError: can't serialize Timestamp, convert `pandas._libs.tslibs.timestamps.Timestamp` to `str`
        forecast.ds = forecast.ds.apply(str)

        # Store prediction results
        td.to_td(
            forecast, f"{database}.{target_table}", con, if_exists="replace"
        )
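The Timestamp workaround in Examples no. 1 and 4 stringifies `ds` element by element with `apply(str)`. pandas' vectorized datetime accessor does the same job and makes the output format explicit; a small self-contained sketch (assuming `ds` is a datetime64 column, which is what Prophet returns):

# Hedged sketch: vectorized alternative to forecast.ds.apply(str).
import pandas as pd

forecast = pd.DataFrame({"ds": pd.to_datetime(["2016-06-01", "2016-06-02"])})
forecast["ds"] = forecast["ds"].dt.strftime("%Y-%m-%d")  # e.g. "2016-06-01"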
Example no. 5
def read_td_table(database_name, table_name, engine_name='presto', limit=1000):
    import pytd.pandas_td as td

    # TD_API_KEY / TD_API_SERVER are read from the environment
    con = td.connect()
    engine = td.create_engine(f"{engine_name}:{database_name}", con=con)
    df = td.read_td(f'SELECT * FROM {table_name} LIMIT {limit}', engine)
    print(df)
Example no. 6
import logging
import os

import chainer
import numpy
import pytd.pandas_td as td

import nlp_utils  # text helpers from the Chainer text-classification example

logger = logging.getLogger(__name__)


def run_batch(
    database, input_table, output_table, device, model, vocab, setup, batchsize=64
):
    def predict_batch(words_batch):
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)

        # Note: Prediction labels are different from original Chainer example
        #       positive: 1, negative: 0
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()

        return answers, scores

    td_api_key = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]

    logger.info("Connect to Treasure Data")

    con = td.connect()
    presto = td.create_engine(f"presto:{database}", con=con)

    logger.info("Fetch data from Treasure Data")
    test_df = td.read_td(
        f"""
        select
            rowid, sentence, sentiment, polarity
        from
            {input_table}
    """,
        presto,
    )

    sentences = test_df["sentence"].tolist()

    logger.info("Start prediction")
    batch = []
    predicted = []
    i = 1
    for sentence in sentences:
        text = nlp_utils.normalize_text(sentence)
        words = nlp_utils.split_text(text, char_based=setup["char_based"])
        batch.append(words)
        if len(batch) >= batchsize:
            _predicted, _ = predict_batch(batch)
            predicted.append(_predicted)
            batch = []
            logger.info(f"Predicted: {i}th batch. batch size {batchsize}")
            i += 1

    if batch:
        _predicted, _ = predict_batch(batch)
        predicted.append(_predicted)

    logger.info("Finish prediction")

    test_df["predicted_polarity"] = numpy.concatenate(predicted, axis=None)

    # Note: The train/test split used here differs from the one used to train
    #       the pre-trained model (built with the official Chainer example), so
    #       the model has already seen part of this test data.
    #       The accuracy below is for demonstration only.
    #
    # accuracy = (test_df.polarity == test_df.predicted_polarity).value_counts()[
    #     1
    # ] / len(test_df)
    # print(f"Test set accuracy: {accuracy}")

    con2 = td.connect(apikey=td_api_key, endpoint=endpoint)

    td.to_td(
        test_df[["rowid", "predicted_polarity"]],
        f"{database}.{output_table}",
        con=con2,
        if_exists="replace",
        index=False,
    )

    logger.info("Upload completed")
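The accumulate-and-flush batching in `run_batch` (fill a list, predict when it reaches `batchsize`, then handle the remainder) can be factored into a small generator. A sketch, with `chunked` as an illustrative helper name:

# Hedged sketch: the same batching pattern as a reusable generator.
# `chunked` is an illustrative helper, not part of the original module.
def chunked(items, size):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:  # remainder smaller than `size`
        yield batch


# Usage mirroring the prediction loop above:
# for i, words_batch in enumerate(chunked(tokenized_sentences, batchsize), start=1):
#     answers, scores = predict_batch(words_batch)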
Example no. 7
    def run(self):
        import pandas as pd
        import pytd.pandas_td as td
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)

        dbname = self.dbname
        source_table = self.source_table

        engine = td.create_engine('presto:{}'.format(dbname), con=connection)

        # Fetch a 25% random sample of the data
        df = td.read_td(
            """
            select *
            from {} tablesample bernoulli(25)
            """.format(source_table), engine)
        # You can use Hive instead:
        #
        # engine_hive = td.create_engine('hive:{}'.format(dbname), con=connection)
        # df = td.read_td(
        #     """
        #     select *
        #     from {}_train
        #     where rnd < 0.25
        #     """.format(source_table),
        #     engine_hive
        # )
        df = df.drop(columns=['time', 'v', 'rnd', 'rowid'], errors='ignore')

        y = df.medv
        X = df.drop(columns=['medv'])

        reg = ExtraTreesRegressor()
        reg = reg.fit(X, y)

        feature_importances = pd.DataFrame({
            'column': X.columns,
            'importance': reg.feature_importances_
        })
        td.to_td(feature_importances,
                 'boston.feature_importances',
                 con=connection,
                 if_exists='replace',
                 index=False)

        model = SelectFromModel(reg, prefit=True)

        feature_idx = model.get_support()
        feature_name = df.drop(columns=['medv']).columns[feature_idx]
        selected_features = set(feature_name)

        categorical_columns = set(['rad', 'chas'])
        quantitative_columns = set(X.columns) - categorical_columns

        feature_types = {
            'categorical_columns': categorical_columns,
            'quantitative_columns': quantitative_columns
        }
        feature_query = self._feature_column_query(selected_features,
                                                   feature_types=feature_types)

        # Store query if possible
        try:
            import digdag
            digdag.env.store({'feature_query': feature_query})

        except ImportError:
            pass
Example no. 8
def run(with_aws=True):
    # The original code is published in the official TensorFlow documentation
    # under the Apache License, Version 2.0:
    # https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub

    import os
    import sys

    os.system(
        f"{sys.executable} -m pip install tensorflow==1.13.1 tensorflow_hub==0.1.1"
    )
    os.system(f"{sys.executable} -m pip install -U pytd==0.6.1")

    import tensorflow as tf
    import tensorflow_hub as hub
    import pytd.pandas_td as td

    con = td.connect(apikey=os.environ['TD_API_KEY'],
                     endpoint=os.environ['TD_API_SERVER'])
    presto = td.create_engine('presto:sentiment', con=con)

    train_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_train_shuffled
    """, presto)

    test_df = td.read_td(
        """
        select
            rowid, sentence, sentiment, polarity
        from
            movie_review_test_shuffled
    """, presto)

    # Shuffle has been done by HiveQL in the shuffle task
    # train_df = train_df.sample(frac=1).reset_index(drop=True)

    with tf.Session(graph=tf.Graph()) as sess:
        train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], num_epochs=None, shuffle=True)

        embedded_text_feature_column = hub.text_embedding_column(
            key="sentence",
            module_spec="https://tfhub.dev/google/nnlm-en-dim128/1")

        estimator = tf.estimator.DNNClassifier(
            hidden_units=[500, 100],
            feature_columns=[embedded_text_feature_column],
            n_classes=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.003))

        estimator.train(input_fn=train_input_fn, steps=1000)

        # Export TF model to S3
        if with_aws:
            _upload_model(embedded_text_feature_column, estimator)

        predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
            train_df, train_df["polarity"], shuffle=False)

        predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
            test_df, test_df["polarity"], shuffle=False)

        train_eval_result = estimator.evaluate(input_fn=predict_train_input_fn)
        test_eval_result = estimator.evaluate(input_fn=predict_test_input_fn)
        print("Training set accuracy: {accuracy}".format(**train_eval_result))
        print("Test set accuracy: {accuracy}".format(**test_eval_result))

        results = get_predictions(estimator, predict_test_input_fn)

    # Store prediction results to Treasure Data

    test_df['predicted_polarity'] = results

    td.to_td(test_df[['rowid', 'predicted_polarity']],
             'sentiment.test_predicted_polarities',
             con=con,
             if_exists='replace',
             index=False)
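Example no. 8 calls `get_predictions` and `_upload_model`, which are defined elsewhere in the original module. In the TensorFlow Hub tutorial the example credits, the prediction helper is essentially a one-liner over `estimator.predict`; the sketch below follows that shape and should be treated as an approximation rather than the author's exact code.

# Hedged sketch of the missing helper, following the shape used in the
# TF Hub text-classification tutorial: collect the predicted class id per row.
def get_predictions(estimator, input_fn):
    return [x["class_ids"][0] for x in estimator.predict(input_fn=input_fn)]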
Example no. 9
    def run(self, dbname="boston", source_table="house_prices"):
        import pandas as pd
        import pytd.pandas_td as td
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        connection = td.connect(apikey=self.apikey, endpoint=self.endpoint)
        engine = td.create_engine(f"presto:{dbname}", con=connection)

        # Fetch a 25% random sample of the data
        df = td.read_td(
            f"""
            select *
            from {source_table} tablesample bernoulli(25)
            """,
            engine,
        )
        # You can use Hive instead:
        #
        # engine_hive = td.create_engine(f'hive:{dbname}', con=connection)
        # df = td.read_td(
        #     """
        #     select *
        #     from {}_train
        #     where rnd < 0.25
        #     """.format(source_table),
        #     engine_hive
        # )
        df = df.drop(columns=["time", "v", "rnd", "rowid"], errors="ignore")

        y = df.medv
        X = df.drop(columns=["medv"])

        categorical_columns = set(["rad", "chas"])
        quantitative_columns = set(X.columns) - categorical_columns

        reg = ExtraTreesRegressor()
        reg = reg.fit(X, y)

        feature_importances = pd.DataFrame({
            "column": X.columns,
            "importance": reg.feature_importances_
        })
        td.to_td(
            feature_importances,
            f"{dbname}.feature_importances",
            con=connection,
            if_exists="replace",
            index=False,
        )

        model = SelectFromModel(reg, prefit=True)

        feature_idx = model.get_support()
        feature_name = df.drop(columns=["medv"]).columns[feature_idx]
        selected_features = set(feature_name)

        categorical_columns = set(["rad", "chas"])
        quantitative_columns = set(X.columns) - categorical_columns

        feature_types = {
            "categorical_columns": categorical_columns,
            "quantitative_columns": quantitative_columns,
        }
        feature_query = self._feature_column_query(selected_features,
                                                   feature_types=feature_types)

        # Store query if possible
        try:
            import digdag

            digdag.env.store({"feature_query": feature_query})

        except ImportError:
            pass