def create_VP_tables(self):
    """Create one "vertical partition" table per distinct property.

    For every property p in self.properties, materializes a two-column
    (s, o) table named VP_<valid_string(p)> from `tripletable`.  When
    self.statsEnabled is set, a second pass computes per-table statistics
    via Stats() and serializes them to self.statsFile.
    """
    # Single-argument parenthesized print works identically on Py2 and Py3.
    print("Beginning the creation of VP tables.")
    total_properties = len(self.properties)
    i = 0
    # For each distinct property, create a table.
    for p in self.properties:
        i += 1
        # Escape backslashes and single quotes so a quote inside a property
        # value cannot break out of the SQL string literal (malformed query /
        # injection).  Hive/Spark SQL string literals use backslash escaping.
        safe_p = p.replace("\\", "\\\\").replace("'", "\\'")
        prop_df = self.sqlContext.sql(
            "SELECT s AS s, o AS o FROM tripletable WHERE p='" + safe_p + "'")
        df_writer = DataFrameWriter(prop_df)
        df_writer.saveAsTable("VP_" + valid_string(p))
        sys.stdout.write("\rTables created: %d / %d " % (i, total_properties))
        # The progress line uses \r with no newline; without an explicit
        # flush it may never appear on a line-buffered stdout.
        sys.stdout.flush()
    # If statistics are enabled, compute them over the tables just created.
    if self.statsEnabled:
        i = 0
        stat = Stats()
        for p in self.properties:
            i += 1
            tableDF = self.sqlContext.sql("SELECT * FROM VP_" + valid_string(p))
            stat.addTableStat(p, tableDF)
            sys.stdout.write("\rStatistics created: %d / %d " % (i, total_properties))
            sys.stdout.flush()
        with open(self.statsFile, "w") as f:
            f.write(stat.getSerializedStats())
        print("Statistics created: %d / %d " % (i, total_properties))
def main():
    """Execute the templated Hive query and persist the result as a JSON
    table on S3, overwriting any previous version."""
    spark_ctx = SparkContext()
    hive_ctx = HiveContext(spark_ctx)
    result = hive_ctx.sql("""{{sql}}""")
    writer = DataFrameWriter(result)
    writer.saveAsTable(
        name='{{tableName}}',
        format='json',
        mode='overwrite',
        path='s3://data/{{tableName}}',
    )
phone = ob.get("bankpremobile", None) else: phone = None return (phone, idCard, idBank, name) # xiaoshudian_app_key = "1186159692" # xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"] # def filter(app_key,api): # not (app_key in xiaoshudian_app_key and api not in xiaoshudian_tec_api) data_rdd = data.rdd.map(lambda a: (a.app_key_param, a.date, standard_params(a.params), a.interface, a.api_type)) \ .map(lambda (a, b, c, d, e): (a, b, c[0], c[1], c[2], c[3], d, e)) ''' c[0]->phone c[1]->idcard c[2]->idbank c[3]->name ''' schemaStr = "app_key date phone idcard idbank name interface api_type" fields = [ StructField(field_name, StringType(), True) for field_name in schemaStr.split() ] schema = StructType(fields) data_df = hc.createDataFrame(data_rdd, schema).distinct() dfw = DataFrameWriter(data_df) dfw.saveAsTable("wl_analysis.t_lel_record_data_backflow", mode="overwrite")
if ob.has_key("mobile"): phone = ob.get("mobile", None) elif ob.has_key("phone"): phone = ob.get("phone", None) elif ob.has_key("enc_m"): phone = ob.get("enc_m", None) elif ob.has_key("ownerMobile"): phone = ob.get("ownerMobile", None) elif ob.has_key("bankpremobile"): phone = ob.get("bankpremobile", None) else: phone = None return (phone, idCard, idBank, name) # xiaoshudian_app_key = "1186159692" # xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"] # def filter(app_key,api): # not (app_key in xiaoshudian_app_key and api not in xiaoshudian_tec_api) data_rdd = data.rdd.map(lambda a:(a.app_key_param,a.date,standard_params(a.params_less),a.interface,a.api_type))\ .map(lambda (a,b,c,d,e):(a,b,c[0],c[1],c[2],c[3],d,e)) schemaStr = "app_key date phone idcard idbank name interface api_type" fields = [ StructField(field_name, StringType(), True) for field_name in schemaStr.split() ] schema = StructType(fields) data_df = sqlContext.createDataFrame(data_rdd, schema).distinct() dfw = DataFrameWriter(data_df) dfw.saveAsTable("wl_analysis.t_lel_datamart_backflow_filtered", mode="overwrite")