Example #1
def upload_dataset(database, table):
    client = pytd.Client(
        database=database,
        apikey=os.environ["TD_API_KEY"],
        endpoint=os.environ["TD_API_SERVER"],
    )

    if client.exists(database, table):
        print("Target database and table exist. Skip")
        return True

    target_url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
    fname = os.path.join("ml-1m", "ratings.dat")
    with urlopen(target_url) as res:
        info = res.info()
        if info.get_content_type() != "application/zip":
            raise IOError("Target file is not a zip file; it might be corrupt")

        with zipfile.ZipFile(io.BytesIO(res.read())) as zipf:
            zipf.extract(fname)

    df = pd.read_csv(
        fname,
        sep="::",
        engine="python",
        names=["userid", "itemid", "rating", "timestamp"],
    )
    client.create_database_if_not_exists(database)
    client.load_table_from_dataframe(df, f"{database}.{table}", if_exists="overwrite")

    shutil.rmtree("ml-1m")
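A minimal invocation sketch for the function above, assuming TD_API_KEY and TD_API_SERVER are already exported; the database and table names are hypothetical.

# Hypothetical driver for upload_dataset(); credentials are read from the
# environment inside the function, so only the target names are passed in.
if __name__ == "__main__":
    upload_dataset("movielens", "ratings")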
Example #2
def read_results(limit, bundles_filter):
    # Read PII from the DS-API output table and print it pipe-delimited to stdout
    global database_name, dest_table, global_db_client
    try:
        global_db_client = pytd.Client(database=database_name)
        bundle_where = ""
        if bundles_filter:
            for bundle in bundles_filter.split(","):
                if bundle_where == "":
                    bundle_where = " where key like '%." + bundle + "%' "
                else:
                    bundle_where += " or key like '%." + bundle + "%' "

        sql = "select * from " + dest_table + bundle_where + " limit " + str(limit)
        print(sql)
        res = global_db_client.query(sql)

        print("ID|KEY|VALUE")
        for record in res["data"]:
            print(str(record[0]) + "|" + record[1] + "|" + record[2])

    except Exception as e:
        raise Exception("Exception reading source table - " + str(e))
Example #3
def main():
    # Create a TD client instance.
    client = pytd.Client(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         database=TD_DATABASE)

    # Retrieves the details for each job.
    cursor = 0
    keys = jobinfo.keys()
    while cursor is not None:
        result = cuenote.call_api("delivery", {
            "limit": "50",
            "cursor": str(cursor)
        })
        jobs = result.json()
        for job in jobs["list"]:
            for key in keys:
                if key in job:
                    jobinfo[key] += [cuenote.format_value(key, job[key])]
                else:
                    # Append an empty placeholder so every column list stays the same length
                    jobinfo[key] += [""]
        cursor = jobs["next_cursor"]
    df_jobinfo = pandas.DataFrame(jobinfo.values(), index=jobinfo.keys()).T

    # Refresh Job Info table.
    client.load_table_from_dataframe(df_jobinfo,
                                     "jobinfo",
                                     writer="bulk_import",
                                     if_exists="overwrite")
Example #4
def run(database, table):
    apikey = os.environ["TD_API_KEY"]
    apiserver = os.environ["TD_API_SERVER"]
    client = pytd.Client(apikey=apikey, endpoint=apiserver)

    # Write your execution code
    return True
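The body above is left as a stub. One purely illustrative way to fill it, reusing the query pattern from the other examples; the row-count query is an assumption, not part of the original.

def run(database, table):
    apikey = os.environ["TD_API_KEY"]
    apiserver = os.environ["TD_API_SERVER"]
    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=database)

    # Hypothetical execution code: count rows in `table` and report the result.
    res = client.query(f"select count(1) from {table}")
    print(f"{database}.{table} has {res['data'][0][0]} rows")
    return True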
Example #5
def import_table(database, table):
    import pandas as pd
    import pytd

    # http://archive.ics.uci.edu/ml/datasets/Online+Retail#
    url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx"

    df = pd.read_excel(url,
                       dtype={
                           "InvoiceNo": str,
                           "InvoiceDate": str,
                           "CustomerID": float,
                           "Country": str,
                           "StockCode": str,
                           "Description": str,
                           "UnitPrice": float,
                           "Quantity": int
                       })
    df = df.loc[pd.notnull(df.CustomerID)]
    df.CustomerID = df.CustomerID.astype(int)

    client = pytd.Client(apikey=os.environ.get('TD_API_KEY'),
                         endpoint=os.environ.get('TD_API_SERVER'),
                         database=database)
    client.load_table_from_dataframe(df, table, if_exists="overwrite")
Example #6
def call_api(sql, url, replaced_param):

  print("Reading sqlfile: {}".format(sql))
  with open(sql, 'r') as f:
    sqlcontent = f.read()

  print("SQL content is:")
  print(sqlcontent)

  client = pytd.Client(
    apikey=os.environ.get('td_apikey'),
    endpoint=os.environ.get('td_endpoint'),
    database=os.environ.get('td_database'),
    default_engine=os.environ.get('td_engine')
    )
  conn = connect(client)

  print("API call start:")
  for index, row in fetch_row(sqlcontent, conn):
    u = url.replace(replaced_param, row)
    print(index, u)
    r = requests.get(u)
    print(r)

  print("API call finished")
Example #7
def upload(sqlfile, database, presigned_url):
    with open(sqlfile) as f:
        querytxt = f.read()

    client = pytd.Client(apikey=os.getenv("td_apikey"), database=database)
    res = client.query(querytxt)
    df = pd.DataFrame(**res)
    csv = df.to_csv(header=False, index=False, sep="\t")

    logger.info("---- user list as first 10 lines----")
    logger.info("\n".join(csv.splitlines()[:10]))
    logger.info("---- Total number of IDs = " + str(len(csv.splitlines())) +
                "----")

    res = requests.put(
        presigned_url,
        data=gzip.compress(bytes(csv, "utf-8")),
        headers={"Content-Encoding": "gzip"},
    )

    if res.status_code != 200:
        logger.error(
            f"Failed to call Yahoo API with http status code {res.status_code}"
        )
        logger.error(res.text)
        sys.exit(os.EX_DATAERR)
    else:
        logger.info(
            f"Succeeded calling Yahoo API with http status code {res.status_code}"
        )
Example #8
    def __init__(
        self,
        api_key,
        api_server,
        database,
        user_size,
        non_null_rate,
        verbose,
        overwrite,
        dry_run,
    ):
        self.client = pytd.Client(apikey=api_key,
                                  endpoint=api_server,
                                  database=database)
        self.database = database
        self.overwrite = overwrite
        self.dry_run = dry_run

        if user_size < 100 or user_size > self.MAX_ADJUSTMENT_COEF:
            raise ValueError(
                f"user_size should be between 100 and {self.MAX_ADJUSTMENT_COEF}")

        self.user_size = user_size
        self.non_null_rate = non_null_rate

        if verbose > 2:
            verbose = 2
        levels = {0: logging.WARN, 1: logging.INFO, 2: logging.DEBUG}

        self.logger = logging.getLogger(__name__)
        ch = logging.StreamHandler()
        self.logger.setLevel(levels[verbose])
        ch.setLevel(levels[verbose])
        self.logger.addHandler(ch)
Example #9
def rss_import(dest_db: str, dest_table: str, rss_url_list):
    import feedparser
    import pytd
    import pandas as pd


    posts = []
    # Fetch RSS data for each URL in `rss_url_list`
    for rss_url in rss_url_list:
        # Parse RSS into Python dictionary
        d = feedparser.parse(rss_url)
        # Get title, description, and link of each entry in an RSS
        for entry in d.entries:
            # You can modify this line to get other fields e.g., summary
            posts.append((entry.title, entry.description, entry.link))

    # Create pandas DataFrame of posts
    # If you want to add other fields, you need to modify `columns` argument
    df = pd.DataFrame(posts, columns=["title", "description", "link"])
    # Create pytd client
    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=dest_db)
    # Create `dest_db` database if not exists
    client.create_database_if_not_exists(dest_db)
    # Upload and append the dataframe to TD as `dest_db`.`dest_table`
    client.load_table_from_dataframe(df, dest_table, if_exists="append")
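As the comments note, extra feed fields can be captured by extending both the tuple and the columns list. A small variant, assuming the feeds expose summary and published fields (the .get() defaults guard against feeds that omit them):

def rss_import_with_extras(dest_db: str, dest_table: str, rss_url_list):
    # Hypothetical variant of rss_import() above that also keeps summary and published.
    import feedparser
    import pytd
    import pandas as pd

    posts = []
    for rss_url in rss_url_list:
        d = feedparser.parse(rss_url)
        for entry in d.entries:
            posts.append((entry.title, entry.description, entry.link,
                          entry.get("summary", ""), entry.get("published", "")))

    df = pd.DataFrame(
        posts, columns=["title", "description", "link", "summary", "published"])
    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=dest_db)
    client.create_database_if_not_exists(dest_db)
    client.load_table_from_dataframe(df, dest_table, if_exists="append")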
Example #10
  def predict(self, data_id, model_id, dest_database, dest_table):
    import json, requests, digdag
    import pandas as pd
    import pytd
  
    token = digdag.env.params['token']
  
    headers = {
        'skyfox-api-access-token': token,
        'Content-Type': 'application/json',
    }
    
    data = json.dumps({ 
      "dataId": data_id,
      "deleteData": False,
      "options": {
        "withInput": True
        }
    })
    response = requests.post(f'https://{skyfox_endpoint}/models/{model_id}/predictions', headers=headers, data=data)

    # Fail fast on an HTTP error before trying to parse the prediction id
    assert response.ok, f"Status Code : {response.status_code} | {response.text}"
    prediction_id = json.loads(response.text)['id']
  
    response = self._check_completed_predicting(model_id, prediction_id)

    results = pd.DataFrame(json.loads(response.text)['results'])
    
    client = pytd.Client(apikey=td_apikey, endpoint=td_endpoint)
    client.load_table_from_dataframe(results.assign(time=session_unixtime), f'{dest_database}.{dest_table}', if_exists='append')

    return True
Example #11
def bulk_load(data):
    columns = [
        'time', 'job_id', 'type', 'query', 'status', 'created_at', 'start_at',
        'org_name', 'database', 'user_name'
    ]
    # Build every row first and construct the DataFrame once
    # (DataFrame.append was removed in pandas 2.0).
    records = [[item[col] for col in columns] for item in data]
    dataframe = pandas.DataFrame(records, columns=columns)

    jar_path = TDSparkContextBuilder.default_jar_path()
    writer = SparkWriter(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         td_spark_path=jar_path)
    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE,
                     writer=writer) as client:
        client.load_table_from_dataframe(dataframe,
                                         TD_TABLE,
                                         if_exists='append')
Example #12
def pull_resource_values(device_id, database, table):
    import pytd
    import pandas as pd
    from mbed_cloud import ConnectAPI
    from prestodb.exceptions import PrestoUserError

    # see PDM Python client ConnectAPI examples:
    # https://www.pelion.com/docs/device-management/current/mbed-cloud-sdk-python/subscriptions.html
    # https://github.com/ARMmbed/mbed-cloud-sdk-python/blob/c2bc539856cc6932e367ed47f7c2e64ef6e3a77a/examples/connect/notifications.py#L25-L45
    api = ConnectAPI({
        'api_key': os.environ.get('PDM_API_KEY'),
        'host': os.environ.get('PDM_HOST')
    })

    # check device existence; an empty result means the device is unknown
    try:
        filters = {'id': {'$eq': device_id}}
        if not api.list_connected_devices(filters=filters).data:
            raise ValueError
    except Exception:
        raise ValueError('cannot find the device: {}'.format(device_id))

    column_names, row = [], []
    for r in api.list_resources(device_id):
        # observable resources whose values can change over time:
        # https://www.pelion.com/docs/device-management/current/connecting/collecting-resources.html
        if r.type is None or not r.observable:
            continue

        try:
            value = api.get_resource_value(device_id, r.path)
        except Exception as e:
            print('skipped resource path {} due to {}'.format(r.path, e))
            continue
        value = _cast(value)
        row.append(value)

        column_names.append(r.type)

        print('path: {}, type: {}, value: {}'.format(r.path, r.type, value))

    if len(row) == 0:
        sys.exit("no resource values from device: {}".format(device_id))

    client = pytd.Client(apikey=os.environ.get('TD_API_KEY'),
                         endpoint=os.environ.get('TD_API_SERVER'),
                         database=database)
    df = pd.DataFrame(**{'columns': column_names, 'data': [row]})

    try:
        client.load_table_from_dataframe(df, table, writer='insert_into',
                                         if_exists='append')
    except PrestoUserError:
        raise RuntimeError("inserted record does not match to current schema "
                           "of table `{}`. Make sure a list of available "
                           "resources is same as previous execution, or "
                           "change `writer` to `spark` or `bulk_import` to "
                           "take advantage of schemaless record insertion."
                           "".format(table))
Example #13
def uploadDataToTD(dataframe, dest_db, dest_table):
    client = pytd.Client(apikey=TD_APIKEY,
                         endpoint=TD_API_ENDPOINT,
                         database=dest_db)

    client.load_table_from_dataframe(dataframe,
                                     dest_table,
                                     writer='bulk_import',
                                     if_exists='overwrite')
Example #14
def read_td_table(database_name, table_name, engine_name='presto', limit=1000):
    import pytd
    import pandas as pd

    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=database_name)

    res = client.query(f"select * from {table_name} limit {limit}", engine=engine_name)
    df = pd.DataFrame(**res)
    print(df)
Example #15
def uploadDataToTD(td_endpoint, td_api_key, dataframe, td_database, td_table):
    client = pytd.Client(apikey=td_api_key,
                         endpoint=td_endpoint,
                         database=td_database,
                         default_engine='hive')
    client.load_table_from_dataframe(dataframe,
                                     td_table,
                                     writer='bulk_import',
                                     if_exists='append',
                                     ignore_index=True)
Example #16
def rss_import(dest_db: str, dest_table: str, rss_url_list):
    # Collect (title, description, link) tuples first and build the DataFrame
    # once (DataFrame.append was removed in pandas 2.0).
    posts = []
    for rss_url in rss_url_list:
        d = feedparser.parse(rss_url)
        for entry in d.entries:
            posts.append((entry.title, entry.description, entry.link))
    df = pd.DataFrame(posts, columns=['title', 'description', 'link'])
    client = pytd.Client(apikey=TD_APIKEY, endpoint=TD_ENDPOINT,
                         database=dest_db, default_engine='presto')
    client.load_table_from_dataframe(df, dest_table, if_exists='append')
Example #17
def uploadOTDataToTD(td_endpoint, td_api_key, dataframe, td_db, td_table):
    try:
        client = pytd.Client(
            apikey=td_api_key,
            endpoint=td_endpoint,
            database=td_db,
            default_engine='hive')
        client.load_table_from_dataframe(
            dataframe, td_table, if_exists='overwrite')
    except Exception as e:
        # Keep the underlying error instead of masking it with a bare except
        raise Exception('Error connecting to TD database: {}'.format(e))
Example #18
def get_since_id():
    query = 'SELECT MAX(id) id FROM {db}.{table}'.format(db=TD_DATABASE,
                                                         table=TD_TABLE)
    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE) as client:
        result = client.query(query)
    if isinstance(result['data'][0][0], int):
        return int(result['data'][0][0])
    else:
        return 0
Example #19
def bulk_load(data):
    # Build every row first and construct the DataFrame once
    # (DataFrame.append was removed in pandas 2.0).
    records = [list(item.values()) for item in data]
    dataframe = pandas.DataFrame(records, columns=list(mp.keys()))

    with pytd.Client(apikey=TD_API_KEY,
                     endpoint=TD_API_SERVER,
                     database=TD_DATABASE) as client:
        client.load_table_from_dataframe(dataframe,
                                         TD_TABLE,
                                         if_exists='append')
Example #20
def write_td_table(database_name, table_name):
    import pytd
    import pandas as pd
    import random

    client = pytd.Client(apikey=apikey, endpoint=apiserver, database=database_name)
    df = pd.DataFrame({"c":[random.random() for _ in range(20)]})

    client.create_database_if_not_exists(database_name)

    table_path = f"{database_name}.{table_name}"
    client.load_table_from_dataframe(df, table_path, if_exists="overwrite")
Example #21
def bundle_append(bundles='',
                  engine_name='presto',
                  max_recs_to_process=100,
                  api_batch_limit=100):
    # initialise connections
    global global_oauth_token, global_db_client, database_name, global_dest_dict, global_dest_df, api_options

    global_oauth_token = get_oauth_token()
    global_db_client = pytd.Client(database=database_name)
    global_dest_df = pd.DataFrame(
        columns=['customer_id', 'key', 'value', 'bundle'])

    start_time = time.time()

    pii_df = read_source_pii(engine_name, max_recs_to_process)

    if isinstance(pii_df, pd.DataFrame):
        if not pii_df.empty:
            # Adjust api batch limit to avoid splitting input records over multiple POST requests
            api_batch_limit = int(
                api_batch_limit / len(api_method.split(','))) * len(
                    api_method.split(','))  # round down to full batchsize
            if api_batch_limit == 0:
                api_batch_limit = len(api_method.split(','))
            print("--- API Batch Limit set to: " + str(api_batch_limit))

            get_ds_api_batch(pii_df, bundles, api_options, api_batch_limit)
            if len(global_dest_dict) > 0:
                drop_dest_table()
                create_dest_table()
                if debug_level > 0:
                    print("Debug - Appending dictionary to Dataframe")
                global_dest_df = global_dest_df.append(global_dest_dict,
                                                       ignore_index=True)
                if debug_level > 0: print("Debug - Loading table to database")
                global_db_client.load_table_from_dataframe(global_dest_df,
                                                           dest_table,
                                                           if_exists='append')
        else:
            print("--- No records retrieved for processing through DSAPI ")
    else:
        print("--- SQL Extraction error. No records to process through DSAPI ")

    print("--- Total Append execution time: {:.3f} seconds ---".format(
        (time.time() - start_time)))


### Example execution when running outside of Treasure Data. Comment out when running on TD ###
# bundle_append(bundles='id,personIds,householdId,businessIds,ukDemoAttributes,ukPostcode,ukBasicDemographics',max_recs_to_process=1)
# bundle_append(bundles='',max_recs_to_process=1000,api_batch_limit=100)
# read_results(100000,bundles_filter='clientIdentityGraph.householdId')
### End of section ###
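The api_batch_limit adjustment in bundle_append() rounds the limit down to a multiple of the number of API methods so a single input record's calls never straddle two POST requests. A worked illustration with hypothetical numbers:

# Hypothetical illustration of the rounding in bundle_append():
# 3 comma-separated methods and a requested limit of 100 round down to 99,
# i.e. 33 complete records per POST.
api_method = "m1,m2,m3"
api_batch_limit = 100
n_methods = len(api_method.split(","))                           # 3
api_batch_limit = int(api_batch_limit / n_methods) * n_methods   # 33 * 3 = 99
if api_batch_limit == 0:   # requested limit was smaller than n_methods
    api_batch_limit = n_methods
print(api_batch_limit)     # 99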
Example #22
def load_sense_video(database, table, bucket, device_id, date):
    os.system(f"{sys.executable} -m pip install -U pytd==0.8.0 td-client")

    import pytd

    client = pytd.Client(database=database)

    s3 = boto3.client(
        "s3",
        aws_access_key_id=os.getenv("S3_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("S3_SECRET_ACCESS_KEY"),
    )
    response = s3.list_objects(
        Bucket=bucket,
        Prefix="AnalyzedData/Rule/future_standard.sensevideo.1.0/{}/{}/texts".format(
            device_id, date
        ),
    )

    if "Contents" not in response:
        print("found no results")
        return

    for c in response["Contents"]:
        key = c["Key"]

        f = tempfile.NamedTemporaryFile(suffix=".tgz")
        s3.download_fileobj(bucket, key, f)
        # Flush buffered bytes so tarfile can reopen the temp file by name
        f.flush()

        print("downloaded file: {} -> {}".format(key, f.name))
        try:
            tar = tarfile.open(name=f.name, mode="r:gz")
        except (tarfile.TarError, OSError):
            print("-> skipped due to read failure")
            f.close()
            continue
        tsvs = list(tsv_files(tar))
        if len(tsvs) == 0:
            tar.close()
            f.close()
            continue
        tsv = tsvs[0]

        tar.extract(member=tsv)

        print("reading TSV: {}".format(tsv.name))
        df = pd.read_csv(tsv.name, sep="\t", header=None, names=column_names)
        client.load_table_from_dataframe(df, table, writer="spark", if_exists="append")

        os.remove(os.path.abspath(tsv.name))
        tar.close()
        f.close()
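load_sense_video() calls a tsv_files() helper and a column_names list that are not defined in this snippet. A hedged guess at the helper, assuming it simply filters the archive for TSV members (column_names would still need to match the sensor's TSV layout):

def tsv_files(tar):
    # Hypothetical helper: yield the regular members of the tar archive
    # whose file names end with ".tsv".
    for member in tar.getmembers():
        if member.isfile() and member.name.endswith(".tsv"):
            yield member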
Example #23
def read_td_table(database_name, table_name, limit=1000):
    import pytd
    import pandas as pd

    apikey = os.environ["TD_API_KEY"]
    apiserver = os.environ["TD_API_SERVER"]
    client = pytd.Client(apikey=apikey,
                         endpoint=apiserver,
                         database=database_name)

    res = client.query(f"select * from {table_name} limit {limit}")
    df = pd.DataFrame(**res)
    print(df)
Example #24
def main():
    # Create a TD client instance.
    client = pytd.Client(apikey=TD_API_KEY,
                         endpoint=TD_API_SERVER,
                         database=TD_DATABASE)

    # Retrieves the details for each job.
    cnt = limit
    page = 1
    while cnt == limit:
        cnt = 0
        for delivery in cuenote.call_api("getDelivList", {
                "limit": str(limit),
                "page": str(page)
        }).iter("deliv_jobqueue"):
            keys = jobinfo.keys()
            for info_items in cuenote.call_api(
                    "getDelivInfo", {
                        "delivid": delivery.attrib["delivid"]
                    }).iter("jobinfo"):
                for key in keys:
                    jobinfo[key] += [
                        cuenote.format_value(key, info_items.attrib[key])
                    ]
            cnt += 1
        page += 1
    df_jobinfo = pandas.DataFrame(jobinfo.values(), index=jobinfo.keys()).T

    # Request CN to generate logs for each delivery.
    expids = {"expid": []}
    for i in range(len(jobinfo["delivid"])):
        if jobinfo["delivtime"][i] >= (int(time.time()) - (60 * 60 * 24 * 14)):
            for expid in cuenote.call_api("startExport", {
                    "delivid": jobinfo["delivid"][i],
                    "strcode": "utf8"
            }).iter("expid"):
                expids["expid"] += [int(expid.text)]
    df_expids = pandas.DataFrame(expids.values(), index=expids.keys()).T

    # Refresh Job Info table.
    client.load_table_from_dataframe(df_jobinfo,
                                     "jobinfo",
                                     writer="bulk_import",
                                     if_exists="overwrite")

    # Insert expids into the queue table.
    if expids["expid"]:
        client.load_table_from_dataframe(df_expids,
                                         "queue",
                                         writer="bulk_import",
                                         if_exists="overwrite")
Example #25
def main(database, date, n_split, n_features, timezone):
    n_split = int(n_split)
    n_features = int(n_features)
    client = pytd.Client(apikey=apikey, endpoint=endpoint)

    result = client.query(f'''
    select
      features, target
    from
      {database}.train
    where
      is_test = 0
  ''')

    target = [r[1] for r in result['data']]
    features = create_sparse_features(result, n_features)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(features, target)
    print(target[0], features)

    del target, features
    gc.collect()

    for i in range(n_split):
        result = client.query(f'''
      select
        features, cookie
      from
        {database}.preprocessed
      where
        nth_group = {i}
    ''')
        features = create_sparse_features(result, n_features)
        pred = clf.predict(features)
        cookies = [r[1] for r in result['data']]
        time = int(
            dt.datetime.strptime(date, '%Y-%m-%d').astimezone(
                gettz(timezone)).timestamp())
        pred_df = pd.DataFrame({
            'pred': pred,
            'cookie': cookies
        }).assign(time=time)
        del pred, cookies
        gc.collect()
        td.to_td(pred_df,
                 f'{database}.predicted',
                 con=con,
                 if_exists='append',
                 time_col='time',
                 index=False)
Example #26
def get_row_count(dest_db: str, dest_table: str):
    client = pytd.Client(apikey=TD_APIKEY,
                         endpoint=TD_ENDPOINT,
                         database=dest_db,
                         default_engine='presto')
    # Collect one (db_name, table_name, row_count) tuple per table and build
    # the DataFrame once (DataFrame.append was removed in pandas 2.0).
    rows = []
    for db in client.list_databases():
        for table in client.list_tables(db.name):
            rows.append((db.name, table.name, table.count))
    df = pd.DataFrame(rows, columns=['db_name', 'table_name', 'row_count'])
    client.load_table_from_dataframe(df, dest_table, if_exists='append')
Example #27
def main(url, database, table, csv_setting_file):
    df = pd.read_csv(url)

    with open(csv_setting_file, "r") as f:
        column_setting = json.load(f)
    header_mapping = {e["from_name"]: e["to_name"] for e in column_setting}
    df_en = df.rename(columns=header_mapping)

    ignore_columns = [e["to_name"] for e in column_setting if e["ignore"]]
    dropped_df = df_en.drop(columns=ignore_columns)

    client = pytd.Client(apikey=os.getenv('apikey'), database=database)
    client.load_table_from_dataframe(dropped_df,
                                     table,
                                     writer='bulk_import',
                                     if_exists='append')
Example #28
    def paginate(db: str, table: str, group_by: str, user_rnd_column: str, count_per_page: int):
        client = pytd.Client(database=db)

        total_count = client.query(f'select COUNT(DISTINCT({group_by})) from {table}')['data'][0][0]
        # Page boundaries, later expressed as fractions of the distinct-user count
        batches = list(range(0, total_count, count_per_page)) + [total_count]
        batches = [b / total_count for b in batches]
        print(f'==Total users: {total_count}. Splitting into {len(batches) - 1} batches')

        for start, end in zip(batches[:-1], batches[1:]):
            query = TDConnector.batch_query(table=table, start=start, end=end,
                                            user_rnd_column=user_rnd_column)
            print(f'==Fetching {query}')
            results = client.query(query)
            columns = results['columns']
            data = results['data']
            yield pd.DataFrame(data=data, columns=columns)
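A minimal consumption sketch, assuming paginate() is a staticmethod on the same TDConnector class as batch_query() and using hypothetical table and column names:

# Hypothetical usage: stream users page by page and process each DataFrame.
for page_df in TDConnector.paginate(db="my_db", table="users",
                                    group_by="user_id",
                                    user_rnd_column="user_rnd",
                                    count_per_page=50000):
    print(len(page_df), list(page_df.columns))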
Example #29
def write_td_table(database_name, table_name):
    import pytd
    import pandas as pd

    apikey = os.environ['TD_API_KEY']
    apiserver = os.environ['TD_API_SERVER']
    client = pytd.Client(apikey=apikey,
                         endpoint=apiserver,
                         database=database_name)

    df = pd.DataFrame(data={"col1": [1, 2, 4], "col2": [1.0, 2.0, 3.0]})

    client.create_database_if_not_exists(database_name)
    client.load_table_from_dataframe(df,
                                     f"{database_name}.{table_name}",
                                     if_exists="overwrite")
Example #30
def rss_import(dest_db: str, dest_table: str, rss_url_list):
    import pytd
    import feedparser
    import pandas as pd

    posts = []
    for rss_url in rss_url_list:
        d = feedparser.parse(rss_url)
        for entry in d.entries:
            posts.append((entry.title, entry.description, entry.link))

    df = pd.DataFrame(posts, columns=["title", "description", "link"])
    client = pytd.Client(apikey=TD_APIKEY,
                         endpoint=TD_ENDPOINT,
                         database=dest_db)
    client.create_database_if_not_exists(dest_db)
    client.load_table_from_dataframe(df, dest_table, if_exists="append")