# Imports required by the loaders below (they may already be present earlier
# in this module alongside the SparkSession / JDBC configuration):
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, to_timestamp


def load_user():
    data_name = 'user_data'
    dataset = 'master/csv'
    dd_schema = StructType([
        StructField('index', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('name', StringType(), True),
        StructField('real_name', StringType(), True),
        StructField('tz', StringType(), True),
        StructField('email', StringType(), True),
        StructField('batch', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, dd_schema)
    df.show()

    # Size the free-text columns explicitly when the JDBC table is created.
    table_name = data_name.upper()
    col_types = alter_column(df, table_name, 'email')
    col_types = col_types + ', ' + alter_column(df, table_name, 'name')
    col_types = col_types + ', ' + alter_column(df, table_name, 'real_name')
    df.write.option("createTableColumnTypes", col_types).jdbc(
        url, data_name, mode='overwrite',
        properties={'user': user, 'password': password, 'driver': driver})

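# `alter_column` is defined elsewhere in this project; judging from how it
# feeds createTableColumnTypes above, it returns a fragment such as
# "email VARCHAR(255)". A minimal sketch consistent with that usage follows.
# The name and body are assumptions for illustration, not the project's code
# (table_name is accepted but unused in this sketch):
def alter_column_sketch(df, table_name, column_name):
    from pyspark.sql.functions import length, max as max_
    # Size the VARCHAR to the longest value currently in the column, so the
    # JDBC-created table does not truncate free-text fields.
    max_len = df.select(max_(length(col(column_name)))).collect()[0][0] or 1
    return '{} VARCHAR({})'.format(column_name, max_len)
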
def load_channel():
    data_name = 'channel_data'
    dataset = 'master/csv'
    schema = StructType([
        StructField('index', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('name', StringType(), True),
        StructField('type', StringType(), True),
        StructField('class', StringType(), True),
        StructField('is_archived', StringType(), True),
        StructField('is_private', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)
    df.show()

    table_name = data_name.upper()
    col_types = alter_column(df, table_name, 'name')
    df.write.option("createTableColumnTypes", col_types).jdbc(
        url, data_name, mode='overwrite',
        properties={'user': user, 'password': password, 'driver': driver})

def load_node():
    data_name = 'node_data'
    dataset = 'metrics/csv'
    schema = StructType([
        StructField('id', StringType(), True),
        StructField('label', StringType(), True),
        StructField('type', StringType(), True),
        StructField('name', StringType(), True),
        StructField('iso', StringType(), True),
        StructField('country', StringType(), True),
        StructField('city', StringType(), True),
        StructField('channel_type', StringType(), True),
        StructField('channel_class', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)
    df.show()

    df.write.jdbc(url, data_name, mode='overwrite',
                  properties={'user': user, 'password': password, 'driver': driver})

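# The write step is identical in every loader in this module. A small helper
# like the sketch below (hypothetical, not part of the original code) would
# factor out the repeated connection properties; it assumes the same
# module-level url / user / password / driver used above:
def write_table_sketch(df, table, col_types=None):
    writer = df.write
    if col_types:
        writer = writer.option("createTableColumnTypes", col_types)
    writer.jdbc(url, table, mode='overwrite',
                properties={'user': user, 'password': password, 'driver': driver})
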
def load_tag():
    data_name = 'tag_data'
    dataset = 'metrics/csv'
    schema = StructType([
        StructField('channel', StringType(), True),
        StructField('ts', StringType(), True),
        StructField('tag', StringType(), True),
        StructField('nice_tag', StringType(), True),
        StructField('type', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)
    df.show()

    table_name = data_name.upper()
    col_types = alter_column(df, table_name, 'tag')
    col_types = col_types + ', ' + alter_column(df, table_name, 'nice_tag')
    df.write.option("createTableColumnTypes", col_types).jdbc(
        url, data_name, mode='overwrite',
        properties={'user': user, 'password': password, 'driver': driver})

def load_file():
    data_name = 'file_data'
    dataset = 'master/csv'
    schema = StructType([
        StructField('index', IntegerType(), True),
        StructField('id', StringType(), True),
        StructField('channel', StringType(), True),
        StructField('name', StringType(), True),
        StructField('time', StringType(), True),
        StructField('user_id', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)

    # Convert the string 'time' column to a proper timestamp.
    df = df.withColumnRenamed("time", "time_as_str")
    df = df.withColumn("time", to_timestamp(col("time_as_str")))
    df = df.drop("time_as_str")
    df.show()

    table_name = data_name.upper()
    col_types = alter_column(df, table_name, 'name')
    df.write.option("createTableColumnTypes", col_types).jdbc(
        url, data_name, mode='overwrite',
        properties={'user': user, 'password': password, 'driver': driver})

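# Note on the rename/convert/drop pattern in load_file (and the loaders below):
# withColumn can also replace a column in place,
#     df = df.withColumn("time", to_timestamp(col("time")))
# The in-place form keeps "time" at its original position in the schema,
# whereas rename-then-drop moves the converted column to the end.
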
def load_channel_ref():
    data_name = 'channel_ref'
    dataset = 'references'
    schema = StructType([
        StructField('id', StringType(), True),
        StructField('index', IntegerType(), True),
        StructField('category_name', StringType(), True),
        StructField('category_index', IntegerType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)
    df.show()

    df.write.jdbc(url, data_name, mode='overwrite',
                  properties={'user': user, 'password': password, 'driver': driver})

def load_edge():
    data_name = 'edge_data'
    dataset = 'metrics/csv'
    schema = StructType([
        StructField('channel', StringType(), True),
        StructField('source', StringType(), True),
        StructField('target', StringType(), True),
        StructField('relate', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)
    df.show()

    df.write.jdbc(url, data_name, mode='overwrite',
                  properties={'user': user, 'password': password, 'driver': driver})

def load_reaction():
    data_name = 'reaction_data'
    dataset = 'master/csv'
    schema = StructType([
        StructField('index', IntegerType(), True),
        StructField('channel', StringType(), True),
        StructField('ts', StringType(), True),
        StructField('thread_ts', StringType(), True),
        StructField('user_id', StringType(), True),
        StructField('reaction', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)
    df.show()

    df.write.jdbc(url, data_name, mode='overwrite',
                  properties={'user': user, 'password': password, 'driver': driver})

def load_poll():
    data_name = 'poll_data'
    dataset = 'master/csv'
    schema = StructType([
        StructField('index', IntegerType(), True),
        StructField('poll_id', StringType(), True),
        StructField('ts', StringType(), True),
        StructField('time', StringType(), True),
        StructField('text', StringType(), True),
        StructField('vote_item', StringType(), True),
        StructField('vote_count', IntegerType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)

    # Convert the string 'time' column to a proper timestamp.
    df = df.withColumnRenamed("time", "time_as_str")
    df = df.withColumn("time", to_timestamp(col("time_as_str")))
    df = df.drop("time_as_str")
    df.show()

    df.write.jdbc(url, data_name, mode='overwrite',
                  properties={'user': user, 'password': password, 'driver': driver})

def load_conversation():
    data_name = 'conversation_data'
    dataset = 'metrics/csv'
    schema = StructType([
        StructField('channel', StringType(), True),
        StructField('type', StringType(), True),
        StructField('subtype', StringType(), True),
        StructField('ts', StringType(), True),
        StructField('thread_ts', StringType(), True),
        StructField('ts_int', DoubleType(), True),
        StructField('time', StringType(), True),
        StructField('user_id', StringType(), True),
        StructField('real_name', StringType(), True),
        StructField('name', StringType(), True),
        StructField('text', StringType(), True),
        StructField('city', StringType(), True),
        StructField('country', StringType(), True),
        StructField('iso', StringType(), True)
    ])
    dd = tl.dd_readfile(dataset, data_name)
    df = spark.createDataFrame(dd, schema)

    # Convert the string 'time' column to a proper timestamp.
    df = df.withColumnRenamed("time", "time_as_str")
    df = df.withColumn("time", to_timestamp(col("time_as_str")))
    df = df.drop("time_as_str")
    df.show()

    table_name = data_name.upper()
    col_types = alter_column(df, table_name, 'real_name')
    col_types = col_types + ', ' + alter_column(df, table_name, 'text')
    df.write.option("createTableColumnTypes", col_types).jdbc(
        url, data_name, mode='overwrite',
        properties={'user': user, 'password': password, 'driver': driver})

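# Example driver (an assumption about how these loaders are invoked; the
# actual entry point is not shown in this module):
if __name__ == '__main__':
    load_user()
    load_channel()
    load_channel_ref()
    load_node()
    load_edge()
    load_tag()
    load_file()
    load_reaction()
    load_poll()
    load_conversation()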