def upload_dataset(database, table):
    client = pytd.Client(
        database=database,
        apikey=os.environ["TD_API_KEY"],
        endpoint=os.environ["TD_API_SERVER"],
    )
    if client.exists(database, table):
        print("Target database and table exist. Skip")
        return True

    target_url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
    fname = os.path.join("ml-1m", "ratings.dat")
    with urlopen(target_url) as res:
        info = res.info()
        if info.get_content_type() != "application/zip":
            raise IOError("Target file isn't Zip file. Might be corrupt")
        with zipfile.ZipFile(io.BytesIO(res.read())) as zipf:
            zipf.extract(fname)

    df = pd.read_csv(
        fname,
        sep="::",
        engine="python",
        names=["userid", "itemid", "rating", "timestamp"],
    )
    client.create_database_if_not_exists(database)
    client.load_table_from_dataframe(df, f"{database}.{table}", if_exists="overwrite")
    shutil.rmtree("ml-1m")

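# Illustrative usage sketch (not part of the original snippet): upload_dataset
# assumes module-level imports of io, os, shutil, zipfile, pandas as pd, pytd, and
# urllib.request.urlopen, plus TD_API_KEY / TD_API_SERVER set in the environment.
# The database and table names below are hypothetical placeholders.
if __name__ == "__main__":
    upload_dataset(database="movielens", table="ratings")
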
def read_results(limit, bundles_filter):
    # Read PII from the DS-API output table and print it pipe-delimited on stdout
    global database_name, dest_table, global_db_client
    try:
        global_db_client = pytd.Client(database=database_name)
        bundle_where = ""
        if bundles_filter:
            for bundle in bundles_filter.split(","):
                if bundle_where == "":
                    bundle_where = bundle_where + " where key like '%." + bundle + "%' "
                else:
                    bundle_where = bundle_where + " or key like '%." + bundle + "%' "
        sql = "select * from " + dest_table + bundle_where + " limit " + str(limit)
        print(sql)
        res = global_db_client.query(sql)
        print("ID|KEY|VALUE")
        for record in res["data"]:
            print(str(record[0]) + "|" + record[1] + "|" + record[2])
    except Exception as e:
        raise Exception("Exception reading source table - " + str(e))

def main():
    # Create a TD client instance.
    client = pytd.Client(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         database=TD_DATABASE)

    # Retrieve the details for each job.
    cursor = 0
    keys = jobinfo.keys()
    while cursor is not None:
        result = cuenote.call_api("delivery", {
            "limit": "50",
            "cursor": str(cursor)
        })
        jobs = result.json()
        for job in jobs["list"]:
            for key in keys:
                if key in job:
                    jobinfo[key] += [cuenote.format_value(key, job[key])]
                else:
                    # append a placeholder so every column stays the same length
                    jobinfo[key] += [""]
        cursor = jobs["next_cursor"]
    df_jobinfo = pandas.DataFrame(jobinfo.values(), index=jobinfo.keys()).T

    # Refresh the Job Info table.
    client.load_table_from_dataframe(df_jobinfo,
                                     "jobinfo",
                                     writer="bulk_import",
                                     if_exists="overwrite")

def run(database, table):
    apikey = os.environ["TD_API_KEY"]
    apiserver = os.environ["TD_API_SERVER"]
    client = pytd.Client(apikey=apikey, endpoint=apiserver)
    # Write your execution code
    return True

def import_table(database, table):
    import pandas as pd
    import pytd

    # http://archive.ics.uci.edu/ml/datasets/Online+Retail#
    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"
    df = pd.read_excel(url,
                       dtype={
                           "InvoiceNo": str,
                           "InvoiceDate": str,
                           "CustomerID": float,
                           "Country": str,
                           "StockCode": str,
                           "Description": str,
                           "UnitPrice": float,
                           "Quantity": int
                       })
    df = df.loc[pd.notnull(df.CustomerID)]
    df.CustomerID = df.CustomerID.astype(int)

    client = pytd.Client(apikey=os.environ.get('TD_API_KEY'),
                         endpoint=os.environ.get('TD_API_SERVER'),
                         database=database)
    client.load_table_from_dataframe(df, table, if_exists="overwrite")

def call_api(sql, url, replaced_param):
    print("Reading sqlfile: {}".format(sql))
    with open(sql, 'r') as f:
        sqlcontent = f.read()
    print("SQL content is:")
    print(sqlcontent)

    client = pytd.Client(
        apikey=os.environ.get('td_apikey'),
        endpoint=os.environ.get('td_endpoint'),
        database=os.environ.get('td_database'),
        default_engine=os.environ.get('td_engine')
    )
    conn = connect(client)

    print("API call start:")
    for index, row in fetch_row(sqlcontent, conn):
        u = url.replace(replaced_param, row)
        print(index, u)
        r = requests.get(u)
        print(r)
    print("API call finished")

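# Illustrative usage sketch (not part of the original snippet): call_api reads a SQL
# file, and each value yielded by fetch_row is substituted into the URL template
# wherever `replaced_param` occurs. `connect` and `fetch_row` are assumed to be
# provided by the surrounding module (e.g. a thin wrapper over pytd's DB-API
# interface), and the file name, URL, and placeholder below are hypothetical.
if __name__ == "__main__":
    call_api(
        sql="user_ids.sql",
        url="https://api.example.com/notify?user_id=__USER_ID__",
        replaced_param="__USER_ID__",
    )
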
def upload(sqlfile, database, presigned_url):
    with open(sqlfile) as f:
        querytxt = f.read()

    client = pytd.Client(apikey=os.getenv("td_apikey"), database=database)
    res = client.query(querytxt)
    df = pd.DataFrame(**res)
    csv = df.to_csv(header=False, index=False, sep="\t")

    logger.info("---- user list as first 10 lines----")
    logger.info("\n".join(csv.splitlines()[:10]))
    logger.info("---- Total number of IDs = " + str(len(csv.splitlines())) + "----")

    res = requests.put(
        presigned_url,
        data=gzip.compress(bytes(csv, "utf-8")),
        headers={"Content-Encoding": "gzip"},
    )
    if res.status_code != 200:
        logger.error(
            f"Failed to call Yahoo API with http status code {res.status_code}"
        )
        logger.error(res.text)
        sys.exit(os.EX_DATAERR)
    else:
        logger.info(
            f"Succeeded calling Yahoo API with http status code {res.status_code}"
        )

def __init__(
    self,
    api_key,
    api_server,
    database,
    user_size,
    non_null_rate,
    verbose,
    overwrite,
    dry_run,
):
    self.client = pytd.Client(apikey=api_key, endpoint=api_server, database=database)
    self.database = database
    self.overwrite = overwrite
    self.dry_run = dry_run

    if user_size < 100 or user_size > self.MAX_ADJUSTMENT_COEF:
        raise ValueError(
            f"user_size should be between 100 and {self.MAX_ADJUSTMENT_COEF}")
    self.user_size = user_size
    self.non_null_rate = non_null_rate

    if verbose > 2:
        verbose = 2
    levels = {0: logging.WARN, 1: logging.INFO, 2: logging.DEBUG}
    self.logger = logging.getLogger(__name__)
    ch = logging.StreamHandler()
    self.logger.setLevel(levels[verbose])
    ch.setLevel(levels[verbose])
    self.logger.addHandler(ch)

def rss_import(dest_db: str, dest_table: str, rss_url_list):
    import feedparser
    import pytd
    import pandas as pd

    posts = []
    # Fetch RSS data for each URL in `rss_url_list`
    for rss_url in rss_url_list:
        # Parse RSS into Python dictionary
        d = feedparser.parse(rss_url)
        # Get title, description, and link of each entry in an RSS
        for entry in d.entries:
            # You can modify this line to get other fields e.g., summary
            posts.append((entry.title, entry.description, entry.link))

    # Create pandas DataFrame of posts
    # If you want to add other fields, you need to modify `columns` argument
    df = pd.DataFrame(posts, columns=["title", "description", "link"])

    # Create pytd client
    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=dest_db)
    # Create `dest_db` database if not exists
    client.create_database_if_not_exists(dest_db)
    # Upload and append the dataframe to TD as `dest_db`.`dest_table`
    client.load_table_from_dataframe(df, dest_table, if_exists="append")

def predict(self, data_id, model_id, dest_database, dest_table):
    import json, requests, digdag
    import pandas as pd
    import pytd

    token = digdag.env.params['token']
    headers = {
        'skyfox-api-access-token': token,
        'Content-Type': 'application/json',
    }
    data = json.dumps({
        "dataId": data_id,
        "deleteData": False,
        "options": {
            "withInput": True
        }
    })
    response = requests.post(
        f'https://{skyfox_endpoint}/models/{model_id}/predictions',
        headers=headers,
        data=data)
    # Fail fast with the API error before parsing the response body.
    assert response.ok, f"Status Code : {response.status_code} | {response.text}"
    prediction_id = json.loads(response.text)['id']

    response = self._check_completed_predicting(model_id, prediction_id)
    results = pd.DataFrame(json.loads(response.text)['results'])

    client = pytd.Client(apikey=td_apikey, endpoint=td_endpoint)
    client.load_table_from_dataframe(results.assign(time=session_unixtime),
                                     f'{dest_database}.{dest_table}',
                                     if_exists='append')
    return True

def bulk_load(data):
    dataframe = pandas.DataFrame(columns=[
        'time', 'job_id', 'type', 'query', 'status', 'created_at', 'start_at',
        'org_name', 'database', 'user_name'
    ])
    for item in data:
        record = pandas.Series([
            item['time'], item['job_id'], item['type'], item['query'],
            item['status'], item['created_at'], item['start_at'],
            item['org_name'], item['database'], item['user_name']
        ], index=dataframe.columns)
        dataframe = dataframe.append(record, ignore_index=True)

    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         td_spark_path=jar_path)
    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE,
                     writer=writer) as client:
        client.load_table_from_dataframe(dataframe, TD_TABLE, if_exists='append')

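# Illustrative sketch (not part of the original snippet): bulk_load expects a list
# of dicts keyed by the column names above. The values shown are hypothetical, and
# TD_API_KEY / TD_API_SERVER / TD_DATABASE / TD_TABLE, plus the pandas, pytd, and
# td-pyspark imports used above, are assumed to exist at module level.
if __name__ == "__main__":
    bulk_load([{
        'time': 1609459200, 'job_id': '12345', 'type': 'presto',
        'query': 'select 1', 'status': 'success',
        'created_at': '2021-01-01 00:00:00', 'start_at': '2021-01-01 00:00:01',
        'org_name': 'example_org', 'database': 'sample_db',
        'user_name': 'example_user',
    }])
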
def pull_resource_values(device_id, database, table):
    import pytd
    import pandas as pd
    from mbed_cloud import ConnectAPI
    from prestodb.exceptions import PrestoUserError

    # see PDM Python client ConnectAPI examples:
    # https://www.pelion.com/docs/device-management/current/mbed-cloud-sdk-python/subscriptions.html
    # https://github.com/ARMmbed/mbed-cloud-sdk-python/blob/c2bc539856cc6932e367ed47f7c2e64ef6e3a77a/examples/connect/notifications.py#L25-L45
    api = ConnectAPI({
        'api_key': os.environ.get('PDM_API_KEY'),
        'host': os.environ.get('PDM_HOST')
    })

    # check device existence
    try:
        filters = {'id': {'$eq': device_id}}
        api.list_connected_devices(filters=filters).data
    except Exception:
        raise ValueError('cannot find the device: {}'.format(device_id))

    column_names, row = [], []
    for r in api.list_resources(device_id):
        # observable resources whose values can change over time:
        # https://www.pelion.com/docs/device-management/current/connecting/collecting-resources.html
        if r.type is None or not r.observable:
            continue
        try:
            value = api.get_resource_value(device_id, r.path)
        except Exception as e:
            print('skipped resource path {} due to {}'.format(r.path, e))
            continue
        value = _cast(value)
        row.append(value)
        column_names.append(r.type)
        print('path: {}, type: {}, value: {}'.format(r.path, r.type, value))

    if len(row) == 0:
        sys.exit("no resource values from device: {}".format(device_id))

    client = pytd.Client(apikey=os.environ.get('TD_API_KEY'),
                         endpoint=os.environ.get('TD_API_SERVER'),
                         database=database)
    df = pd.DataFrame(**{'columns': column_names, 'data': [row]})
    try:
        client.load_table_from_dataframe(df, table, writer='insert_into',
                                         if_exists='append')
    except PrestoUserError:
        raise RuntimeError("inserted record does not match the current schema "
                           "of table `{}`. Make sure the list of available "
                           "resources is the same as in the previous execution, "
                           "or change `writer` to `spark` or `bulk_import` to "
                           "take advantage of schemaless record insertion."
                           "".format(table))

def uploadDataToTD(dataframe, dest_db, dest_table):
    client = pytd.Client(apikey=TD_APIKEY,
                         endpoint=TD_API_ENDPOINT,
                         database=dest_db)
    client.load_table_from_dataframe(dataframe,
                                     dest_table,
                                     writer='bulk_import',
                                     if_exists='overwrite')

def read_td_table(database_name, table_name, engine_name='presto', limit=1000):
    import pytd
    import pandas as pd

    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=database_name)
    res = client.query(f"select * from {table_name} limit {limit}", engine=engine_name)
    df = pd.DataFrame(**res)
    print(df)

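# For reference (a sketch, not part of the original snippet): pytd's Client.query
# returns a dict with "columns" and "data" keys, which is why pd.DataFrame(**res)
# works above. The module-level `apikey` and `apiserver` are assumed to be defined
# elsewhere, e.g. read from the TD_API_KEY / TD_API_SERVER environment variables.
if __name__ == "__main__":
    import pandas as pd

    res = {"columns": ["userid", "rating"], "data": [[1, 5], [2, 3]]}
    print(pd.DataFrame(**res))  # same unpacking as in read_td_table above
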
def uploadDataToTD(td_endpoint, td_api_key, dataframe, td_database, td_table):
    client = pytd.Client(apikey=td_api_key,
                         endpoint=td_endpoint,
                         database=td_database,
                         default_engine='hive')
    client.load_table_from_dataframe(dataframe,
                                     td_table,
                                     writer='bulk_import',
                                     if_exists='append',
                                     ignore_index=True)

def rss_import(dest_db: str, dest_table: str, rss_url_list):
    df = pd.DataFrame(columns=['title', 'description', 'link'])
    ts = str(int(time.time()))
    for rss_url in rss_url_list:
        d = feedparser.parse(rss_url)
        for entry in d.entries:
            tmp_se = pd.Series([entry.title, entry.description, entry.link],
                               index=df.columns)
            df = df.append(tmp_se, ignore_index=True)
    #print(df)

    client = pytd.Client(apikey=TD_APIKEY,
                         endpoint=TD_ENDPOINT,
                         database=dest_db,
                         default_engine='presto')
    client.load_table_from_dataframe(df, dest_table, if_exists='append')

def uploadOTDataToTD(td_endpoint, td_api_key, dataframe, td_db, td_table):
    try:
        client = pytd.Client(apikey=td_api_key,
                             endpoint=td_endpoint,
                             database=td_db,
                             default_engine='hive')
        client.load_table_from_dataframe(dataframe,
                                         td_table,
                                         if_exists='overwrite')
    except Exception as e:
        raise Exception('Error connecting to TD database') from e

def get_since_id():
    query = 'SELECT MAX(id) id FROM {db}.{table}'.format(db=TD_DATABASE,
                                                         table=TD_TABLE)
    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE) as client:
        result = client.query(query)
        if type(result['data'][0][0]) is int:
            return int(result['data'][0][0])
        else:
            return 0

def bulk_load(data):
    dataframe = pandas.DataFrame(columns=mp.keys())
    for item in data:
        record = pandas.Series(list(item.values()), index=dataframe.columns)
        dataframe = dataframe.append(record, ignore_index=True)

    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE) as client:
        client.load_table_from_dataframe(dataframe, TD_TABLE, if_exists='append')

def write_td_table(database_name, table_name):
    import pytd
    import pandas as pd
    import random

    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=database_name)
    df = pd.DataFrame({"c": [random.random() for _ in range(20)]})
    client.create_database_if_not_exists(database_name)
    table_path = f"{database_name}.{table_name}"
    client.load_table_from_dataframe(df, table_path, if_exists="overwrite")

def bundle_append(bundles='', engine_name='presto', max_recs_to_process=100,
                  api_batch_limit=100):
    # initialise connections
    global global_oauth_token, global_db_client, database_name, global_dest_dict, global_dest_df, api_options
    global_oauth_token = get_oauth_token()
    global_db_client = pytd.Client(database=database_name)
    global_dest_df = pd.DataFrame(
        columns=['customer_id', 'key', 'value', 'bundle'])
    start_time = time.time()

    pii_df = read_source_pii(engine_name, max_recs_to_process)
    if isinstance(pii_df, pd.DataFrame):
        if not pii_df.empty:
            # Adjust api batch limit to avoid splitting input records over multiple POST requests
            api_batch_limit = int(
                api_batch_limit / len(api_method.split(','))) * len(
                    api_method.split(','))  # round down to full batchsize
            if api_batch_limit == 0:
                api_batch_limit = len(api_method.split(','))
            print("--- API Batch Limit set to: " + str(api_batch_limit))

            get_ds_api_batch(pii_df, bundles, api_options, api_batch_limit)
            if len(global_dest_dict) > 0:
                drop_dest_table()
                create_dest_table()
                if debug_level > 0:
                    print("Debug - Appending dictionary to Dataframe")
                global_dest_df = global_dest_df.append(global_dest_dict,
                                                       ignore_index=True)
                if debug_level > 0:
                    print("Debug - Loading table to database")
                global_db_client.load_table_from_dataframe(global_dest_df,
                                                           dest_table,
                                                           if_exists='append')
            else:
                print("--- No records retrieved for processing through DSAPI ")
    else:
        print("--- SQL Extraction error. No records to process through DSAPI ")

    print("--- Total Append execution time: {:.3f} seconds ---".format(
        (time.time() - start_time)))


### Example execution when running outside of Treasure Data. Comment out when running on TD ###
# bundle_append(bundles='id,personIds,householdId,businessIds,ukDemoAttributes,ukPostcode,ukBasicDemographics',max_recs_to_process=1)
# bundle_append(bundles='',max_recs_to_process=1000,api_batch_limit=100)
# read_results(100000,bundles_filter='clientIdentityGraph.householdId')
### End of section ###

def load_sense_video(database, table, bucket, device_id, date):
    os.system(f"{sys.executable} -m pip install -U pytd==0.8.0 td-client")
    import pytd

    client = pytd.Client(database=database)
    s3 = boto3.client(
        "s3",
        aws_access_key_id=os.getenv("S3_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("S3_SECRET_ACCESS_KEY"),
    )
    response = s3.list_objects(
        Bucket=bucket,
        Prefix="AnalyzedData/Rule/future_standard.sensevideo.1.0/{}/{}/texts".format(
            device_id, date
        ),
    )
    if "Contents" not in response:
        print("found no results")
        return

    for c in response["Contents"]:
        key = c["Key"]
        f = tempfile.NamedTemporaryFile(suffix=".tgz")
        s3.download_fileobj(bucket, key, f)
        print("downloaded file: {} -> {}".format(key, f.name))
        try:
            tar = tarfile.open(name=f.name, mode="r:gz")
        except:
            print("-> skipped due to read failure")
            f.close()
            continue
        tsvs = list(tsv_files(tar))
        if len(tsvs) == 0:
            tar.close()
            f.close()
            continue
        tsv = tsvs[0]
        tar.extract(member=tsv)
        print("reading TSV: {}".format(tsv.name))
        df = pd.read_csv(tsv.name, sep="\t", header=None, names=column_names)
        client.load_table_from_dataframe(df, table, writer="spark", if_exists="append")
        os.remove(os.path.abspath(tsv.name))
        tar.close()
        f.close()

def read_td_table(database_name, table_name, limit=1000):
    import pytd
    import pandas as pd

    apikey = os.environ["TD_API_KEY"]
    apiserver = os.environ["TD_API_SERVER"]
    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=database_name)
    res = client.query(f"select * from {table_name} limit {limit}")
    df = pd.DataFrame(**res)
    print(df)

def main():
    # Create a TD client instance.
    client = pytd.Client(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         database=TD_DATABASE)

    # Retrieve the details for each job.
    cnt = limit
    page = 1
    while cnt == limit:
        cnt = 0
        for delivery in cuenote.call_api("getDelivList", {
                "limit": str(limit),
                "page": str(page)
        }).iter("deliv_jobqueue"):
            keys = jobinfo.keys()
            for info_items in cuenote.call_api(
                    "getDelivInfo", {
                        "delivid": delivery.attrib["delivid"]
                    }).iter("jobinfo"):
                for key in keys:
                    jobinfo[key] += [
                        cuenote.format_value(key, info_items.attrib[key])
                    ]
            cnt += 1
        page += 1
    df_jobinfo = pandas.DataFrame(jobinfo.values(), index=jobinfo.keys()).T

    # Request CN to generate logs for each delivery.
    expids = {"expid": []}
    for i in range(len(jobinfo["delivid"])):
        if jobinfo["delivtime"][i] >= (int(time.time()) - (60 * 60 * 24 * 14)):
            for expid in cuenote.call_api("startExport", {
                    "delivid": jobinfo["delivid"][i],
                    "strcode": "utf8"
            }).iter("expid"):
                expids["expid"] += [int(expid.text)]
    df_expids = pandas.DataFrame(expids.values(), index=expids.keys()).T

    # Refresh the Job Info table.
    client.load_table_from_dataframe(df_jobinfo,
                                     "jobinfo",
                                     writer="bulk_import",
                                     if_exists="overwrite")

    # Insert expids into the queue table.
    if expids["expid"]:
        client.load_table_from_dataframe(df_expids,
                                         "queue",
                                         writer="bulk_import",
                                         if_exists="overwrite")

def main(database, date, n_split, n_features, timezone):
    n_split = int(n_split)
    n_features = int(n_features)
    client = pytd.Client(apikey=apikey, endpoint=endpoint)

    result = client.query(f'''
        select features, target
        from {database}.train
        where is_test = 0
    ''')
    target = [r[1] for r in result['data']]
    features = create_sparse_features(result, n_features)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(features, target)
    print(target[0], features)
    del target, features
    gc.collect()

    for i in range(n_split):
        result = client.query(f'''
            select features, cookie
            from {database}.preprocessed
            where nth_group = {i}
        ''')
        features = create_sparse_features(result, n_features)
        pred = clf.predict(features)
        cookies = [r[1] for r in result['data']]
        time = int(
            dt.datetime.strptime(date, '%Y-%m-%d').astimezone(
                gettz(timezone)).timestamp())
        pred_df = pd.DataFrame({
            'pred': pred,
            'cookie': cookies
        }).assign(time=time)
        del pred, cookies
        gc.collect()
        td.to_td(pred_df,
                 f'{database}.predicted',
                 con=con,
                 if_exists='append',
                 time_col='time',
                 index=False)

def get_row_count(dest_db: str, dest_table: str):
    df = pd.DataFrame(columns=['db_name', 'table_name', 'row_count'])
    client = pytd.Client(apikey=TD_APIKEY,
                         endpoint=TD_ENDPOINT,
                         database=dest_db,
                         default_engine='presto')
    for db in client.list_databases():
        for table in client.list_tables(db.name):
            tmp_se = pd.Series([db.name, table.name, table.count],
                               index=df.columns)
            df = df.append(tmp_se, ignore_index=True)
            #print(db.name + ',' + table.name + ',' + str(table.count))
    #print(df)
    client.load_table_from_dataframe(df, dest_table, if_exists='append')

def main(url, database, table, csv_setting_file):
    df = pd.read_csv(url)
    with open(csv_setting_file, "r") as f:
        column_setting = json.load(f)

    header_mapping = {e["from_name"]: e["to_name"] for e in column_setting}
    df_en = df.rename(columns=header_mapping)
    ignore_columns = [e["to_name"] for e in column_setting if e["ignore"]]
    dropped_df = df_en.drop(columns=ignore_columns)

    client = pytd.Client(apikey=os.getenv('apikey'), database=database)
    client.load_table_from_dataframe(dropped_df,
                                     table,
                                     writer='bulk_import',
                                     if_exists='append')

def paginate(db: str, table: str, group_by: str, user_rnd_column: str,
             count_per_page: int):
    client = pytd.Client(database=db)
    total_count = client.query(
        f'select COUNT(DISTINCT({group_by})) from {table}')['data'][0][0]
    batches = list(range(0, count_per_page * total_count // count_per_page,
                         count_per_page)) + [total_count]
    batches = [b / total_count for b in batches]
    print(f'==Total users: {total_count}. Splitting into {len(batches)} batches')

    for start, end in zip(batches[:-1], batches[1:]):
        query = TDConnector.batch_query(table=table,
                                        start=start,
                                        end=end,
                                        user_rnd_column=user_rnd_column)
        print(f'==Fetching {query}')
        results = client.query(query)
        columns = results['columns']
        data = results['data']
        yield pd.DataFrame(data=data, columns=columns)

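# Illustrative usage sketch (not part of the original snippet): paginate is a
# generator that yields one DataFrame per batch, so it is consumed in a loop.
# TDConnector and the module-level pandas/pytd imports are assumed to exist in the
# surrounding module; the database, table, and column names below are hypothetical.
if __name__ == "__main__":
    for page_df in paginate(db="my_db", table="events", group_by="user_id",
                            user_rnd_column="user_rnd", count_per_page=10000):
        print(len(page_df))
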
def write_td_table(database_name, table_name):
    import pytd
    import pandas as pd

    apikey = os.environ['TD_API_KEY']
    apiserver = os.environ['TD_API_SERVER']
    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=database_name)
    df = pd.DataFrame(data={"col1": [1, 2, 4], "col2": [1.0, 2.0, 3.0]})
    client.create_database_if_not_exists(database_name)
    client.load_table_from_dataframe(df, f"{database_name}.{table_name}",
                                     if_exists="overwrite")

def rss_import(dest_db: str, dest_table: str, rss_url_list):
    import pytd
    import feedparser

    posts = []
    for rss_url in rss_url_list:
        d = feedparser.parse(rss_url)
        for entry in d.entries:
            posts.append((entry.title, entry.description, entry.link))

    df = pd.DataFrame(posts, columns=["title", "description", "link"])
    client = pytd.Client(apikey=TD_APIKEY, endpoint=TD_ENDPOINT, database=dest_db)
    client.create_database_if_not_exists(dest_db)
    client.load_table_from_dataframe(df, dest_table, if_exists="append")

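# Illustrative usage sketch (not part of the original snippet): TD_APIKEY and
# TD_ENDPOINT, plus a module-level `import pandas as pd`, are assumed to be defined
# elsewhere. The feed URL, database, and table names below are hypothetical.
if __name__ == "__main__":
    rss_import("rss_db", "rss_posts", ["https://example.com/feed.xml"])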