Example #1
def bulk_load(data):
    columns = [
        'time', 'job_id', 'type', 'query', 'status', 'created_at', 'start_at',
        'org_name', 'database', 'user_name'
    ]
    # Build all rows first and create the DataFrame in a single call;
    # DataFrame.append was removed in pandas 2.0.
    records = [[item[column] for column in columns] for item in data]
    dataframe = pandas.DataFrame(records, columns=columns)

    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         td_spark_path=jar_path)
    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE,
                     writer=writer) as client:
        client.load_table_from_dataframe(dataframe,
                                         TD_TABLE,
                                         if_exists='append')
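This snippet relies on module-level imports and Treasure Data settings that are not shown in the excerpt. A minimal sketch of that setup, assuming the usual pytd / td-pyspark imports and with placeholder database and table names, might look like:

import os

import pandas
import pytd
from pytd.writer import SparkWriter
from td_pyspark import TDSparkContextBuilder

# Placeholder configuration; TD_DATABASE and TD_TABLE are hypothetical names.
TD_API_KEY = os.environ["TD_API_KEY"]
TD_API_SERVER = os.environ.get("TD_API_SERVER", "https://api.treasuredata.com")
TD_DATABASE = "my_database"
TD_TABLE = "my_table"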
def _prepare_td_spark() -> TDSparkContext:
    """
    Create a SparkSession in local mode with td-spark specific configurations.
    :return: TDSparkContext
    """

    apikey = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]

    site = "us"
    if ".co.jp" in endpoint:
        site = "jp"
    elif "eu01" in endpoint:
        site = "eu01"

    builder = SparkSession.builder.appName("spark_als")
    td = (TDSparkContextBuilder(builder).apikey(apikey).site(site).jars(
        TDSparkContextBuilder.default_jar_path()).build())

    return td
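The returned TDSparkContext can then be used like any other td-pyspark context. A short usage sketch (the table name is only an illustration):

td = _prepare_td_spark()
# Read a Treasure Data table into a Spark DataFrame and preview a few rows.
spark_df = td.table("sample_datasets.www_access").df()
spark_df.show(5)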
def rss_import(dest_db: str, dest_table: str, rss_url_list):
    # Collect title/description/link from every feed entry, then build the
    # DataFrame in a single call (DataFrame.append was removed in pandas 2.0).
    ts = str(int(time.time()))  # epoch timestamp (unused below)
    rows = []
    for rss_url in rss_url_list:
        d = feedparser.parse(rss_url)
        for entry in d.entries:
            rows.append([entry.title, entry.description, entry.link])
    df = pd.DataFrame(rows, columns=['title', 'description', 'link'])
    #print(df)
    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_APIKEY, endpoint=TD_ENDPOINT, td_spark_path=jar_path)
    client = pytd.Client(apikey=TD_APIKEY, endpoint=TD_ENDPOINT, database=dest_db, writer=writer, engine='presto')
    client.load_table_from_dataframe(df, dest_table, if_exists='append')
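A call might look like the following; the destination names and feed URL are placeholders:

rss_import(
    dest_db="rss_db",
    dest_table="rss_entries",
    rss_url_list=["https://example.com/feed.xml"],
)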
Example #4
def get_records(api, basic, org, app_id, database, table, fields, query,
                id_field_code):
    # Load the API list (a string holding a list of {"id": ..., "key": ...} dicts)
    api_list = eval(api)
    # Create a connection to Treasure Data
    writer = SparkWriter(
        td_spark_path=TDSparkContextBuilder.default_jar_path())
    con = td.connect(writer=writer)
    # Loop over the registered apps
    for a in api_list:
        # Select the app matching app_id
        if a["id"] == app_id:
            # kintone API settings
            url = f"https://{org}.cybozu.com/k/v1/records.json"
            headers = {"X-Cybozu-API-Token": a["key"], "Authorization": basic}
            payload = {
                "app": app_id,
                "query": query,
                "fields": fields,
                "totalCount": "true"
            }
            r = requests.get(url, headers=headers, params=payload)
            count = int(json.loads(r.text)["totalCount"])
            print(count)
            # Cache the fetched records page by page (100 records per request)
            for i in itertools.islice(range(0, count), 0, None, 100):
                paged_query = (query + " order by " + id_field_code +
                               " asc limit 100 offset " + f"{i}")
                print(paged_query)
                payload = {"app": app_id, "query": paged_query, "fields": fields}
                r = requests.get(url, headers=headers, params=payload)
                if r.status_code != 200:
                    sys.exit(1)
                else:
                    data = json.loads(r.text)
                    df = pd.DataFrame.from_dict(data)
                    df = json_normalize(df["records"])
                    df = df.rename(columns=column_encode)
                # Store the records fetched from the kintone app into the TD table
                td.to_td(
                    df,
                    ".".join([database, table]),
                    con,
                    if_exists="append",
                    index=False,
                )
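The api argument is a string that eval() turns into a list of dicts, each holding a kintone app id and its API token (the code reads a["id"] and a["key"]). A hypothetical invocation, with every value a placeholder:

api_list_str = '[{"id": 123, "key": "kintone-api-token"}]'  # hypothetical app list
get_records(
    api=api_list_str,
    basic="Basic dXNlcjpwYXNzd29yZA==",  # hypothetical Basic auth header value
    org="example-org",
    app_id=123,
    database="kintone_db",
    table="kintone_records",
    fields=["record_id", "title"],
    query="record_id > 0",
    id_field_code="record_id",
)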
def bulk_load(data):
    # Build all rows first and create the DataFrame in a single call;
    # DataFrame.append was removed in pandas 2.0.
    records = [list(item.values()) for item in data]
    dataframe = pandas.DataFrame(records, columns=list(mp.keys()))

    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         td_spark_path=jar_path)
    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE,
                     writer=writer) as client:
        client.load_table_from_dataframe(dataframe,
                                         TD_TABLE,
                                         if_exists='append')
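This variant takes its destination columns from a module-level mapping mp; only the keys are used here, and each incoming item is expected to carry its values in the same order. A hypothetical definition:

# Hypothetical mapping; bulk_load above only reads the keys as column names.
mp = {
    'time': 'long',
    'job_id': 'string',
    'status': 'string',
}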
Example #6
def get_row_count(dest_db: str, dest_table: str):
    # The result DataFrame is built after all rows have been collected below.
    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_APIKEY,
                         endpoint=TD_ENDPOINT,
                         td_spark_path=jar_path)
    client = pytd.Client(apikey=TD_APIKEY,
                         endpoint=TD_ENDPOINT,
                         database=dest_db,
                         writer=writer,
                         engine='presto')
    # Collect one row per table, then build the DataFrame in a single call
    # (DataFrame.append was removed in pandas 2.0).
    rows = []
    for db in client.list_databases():
        for table in client.list_tables(db.name):
            rows.append([db.name, table.name, table.count])
            #print(db.name + ',' + table.name + ',' + str(table.count))
    df = pd.DataFrame(rows, columns=['db_name', 'table_name', 'row_count'])
    #print(df)
    client.load_table_from_dataframe(df, dest_table, if_exists='append')
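A call writes one row per table, across every database visible to the API key, into the destination table. The names below are placeholders:

get_row_count(dest_db="td_metadata", dest_table="table_row_counts")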
Example #7
def fetch_td_spark_context(
    apikey=None,
    endpoint=None,
    td_spark_path=None,
    download_if_missing=True,
    spark_configs=None,
):
    """Build TDSparkContext via td-pyspark.

    Parameters
    ----------
    apikey : str, optional
        Treasure Data API key. If not given, the value of the environment
        variable ``TD_API_KEY`` is used by default.

    endpoint : str, optional
        Treasure Data API server. If not given, ``https://api.treasuredata.com`` is
        used by default. The list of available endpoints is:
        https://tddocs.atlassian.net/wiki/spaces/PD/pages/1085143/Sites+and+Endpoints

    td_spark_path : str, optional
        Path to td-spark-assembly-{td-spark-version}_spark{spark-version}.jar.
        If not given, the path returned by
        ``TDSparkContextBuilder.default_jar_path()`` is used by default.

    download_if_missing : bool, default: True
        Download td-spark if it does not exist at the time of initialization.

    spark_configs : dict, optional
        Additional Spark configurations to be set via ``SparkConf``'s ``set`` method.

    Returns
    -------
    :class:`td_pyspark.TDSparkContext`
        Connection to td-spark
    """
    try:
        import td_pyspark
        from pyspark.conf import SparkConf
        from pyspark.sql import SparkSession
        from td_pyspark import TDSparkContextBuilder
    except ImportError:
        raise RuntimeError("td_pyspark is not installed")

    apikey = apikey or os.environ.get("TD_API_KEY")
    if apikey is None:
        raise ValueError("either argument 'apikey' or environment variable"
                         "'TD_API_KEY' should be set")
    if endpoint is None:
        endpoint = os.getenv("TD_API_SERVER", "https://api.treasuredata.com")

    conf = (SparkConf().setMaster("local[*]").set(
        "spark.serializer", "org.apache.spark.serializer.KryoSerializer").set(
            "spark.sql.execution.arrow.pyspark.enabled", "true"))
    if isinstance(spark_configs, dict):
        for k, v in spark_configs.items():
            conf.set(k, v)
    builder = TDSparkContextBuilder(SparkSession.builder.config(conf=conf))

    builder.apikey(apikey)

    if td_spark_path is None:
        td_spark_path = TDSparkContextBuilder.default_jar_path()
    else:
        td_spark_path = os.path.expanduser(td_spark_path)

    available = os.path.exists(td_spark_path)

    if not available and download_if_missing:
        download_td_spark(version=td_pyspark.__version__,
                          destination=td_spark_path)
    elif not available:
        raise IOError(
            "td-spark is not found and `download_if_missing` is False")

    builder.jars(td_spark_path)

    plazma_api = os.getenv("TD_PLAZMA_API")
    presto_api = os.getenv("TD_PRESTO_API")

    if plazma_api and presto_api:
        api_regex = re.compile(r"(?:https?://)?(api(?:-.+?)?)\.")
        builder.api_endpoint(api_regex.sub("\\1.", endpoint).strip("/"))
        builder.plazma_endpoint(plazma_api)
        builder.presto_endpoint(presto_api)

    site = "us"
    if ".co.jp" in endpoint:
        site = "jp"
    if "eu01" in endpoint:
        site = "eu01"
    if "ap02" in endpoint:
        site = "ap02"
    builder.site(site)

    try:
        return builder.build()
    except Exception as e:
        raise RuntimeError("failed to connect to td-spark: " + str(e))
def run_batch(database,
              input_table,
              output_table,
              device,
              model,
              vocab,
              setup,
              batchsize=64):
    def predict_batch(words_batch):
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)

        # Note: Prediction labels are different from original Chainer example
        #       positive: 1, negative: 0
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()

        return answers, scores

    td_api_key = os.environ["TD_API_KEY"]
    endpoint = os.environ["TD_API_SERVER"]
    jar_path = TDSparkContextBuilder.default_jar_path()

    logger.info("Connect to Treasure Data")

    con = td.connect()
    presto = td.create_engine(f"presto:{database}", con=con)

    logger.info("Fetch data from Treasure Data")
    test_df = td.read_td(
        f"""
        select
            rowid, sentence, sentiment, polarity
        from
            {input_table}
    """,
        presto,
    )

    sentences = test_df["sentence"].tolist()

    logger.info("Start prediction")
    batch = []
    predicted = []
    i = 1
    for sentence in sentences:
        text = nlp_utils.normalize_text(sentence)
        words = nlp_utils.split_text(text, char_based=setup["char_based"])
        batch.append(words)
        if len(batch) >= batchsize:
            _predicted, _ = predict_batch(batch)
            predicted.append(_predicted)
            batch = []
            logger.info(f"Predicted: {i}th batch. batch size {batchsize}")
            i += 1

    if batch:
        _predicted, _ = predict_batch(batch)
        predicted.append(_predicted)

    logger.info("Finish prediction")

    test_df["predicted_polarity"] = numpy.concatenate(predicted, axis=None)

    # Note: The train/test split used for these tables differs from the one used
    #       for the pre-trained model (trained with the official Chainer example),
    #       so the model may have already seen part of this test data.
    #       The accuracy below is for demonstration purposes only.
    #
    # accuracy = (test_df.polarity == test_df.predicted_polarity).value_counts()[
    #     1
    # ] / len(test_df)
    # print(f"Test set accuracy: {accuracy}")

    writer = SparkWriter(apikey=td_api_key,
                         endpoint=endpoint,
                         td_spark_path=jar_path)
    con2 = td.connect(apikey=td_api_key, endpoint=endpoint, writer=writer)

    td.to_td(
        test_df[["rowid", "predicted_polarity"]],
        f"{database}.{output_table}",
        con=con2,
        if_exists="replace",
        index=False,
    )

    logger.info("Upload completed")