Exemple #1
0
    def create_VP_tables(self):
        """Create one ``VP_*`` table per property and, optionally, statistics.

        For every entry of ``self.properties`` the rows of ``tripletable``
        whose ``p`` column equals that property are saved, keeping only the
        ``s`` and ``o`` columns, as the table ``"VP_" + valid_string(p)``.

        When ``self.statsEnabled`` is truthy, each of those tables is read
        back and passed to a ``Stats`` collector, and the serialized result is
        written to ``self.statsFile``.

        Progress is reported on stdout as a single line rewritten in place
        with a carriage return; each progress line is closed with a newline
        once its loop has finished.
        """
        print("Beginning the creation of VP tables.")
        total_properties = len(self.properties)

        # Resolve the source table once. Filtering through the DataFrame API
        # instead of splicing the property into a SQL string means a property
        # that contains a quote character cannot break the query.
        triples = self.sqlContext.table("tripletable")

        # for each property, create a table
        for i, p in enumerate(self.properties, 1):
            prop_df = triples.filter(triples["p"] == p).select("s", "o")
            prop_df.write.saveAsTable("VP_" + valid_string(p))
            sys.stdout.write("\rTables created: %d / %d " %
                             (i, total_properties))
            sys.stdout.flush()
        # Close the in-place progress line so later output starts cleanly.
        sys.stdout.write("\n")

        # Statistics are optional; nothing about them is printed when they
        # are disabled.
        if not self.statsEnabled:
            return

        stat = Stats()
        for i, p in enumerate(self.properties, 1):
            tableDF = self.sqlContext.sql("SELECT * FROM VP_" +
                                          valid_string(p))
            stat.addTableStat(p, tableDF)
            sys.stdout.write("\rStatistics created: %d / %d " %
                             (i, total_properties))
            sys.stdout.flush()
        sys.stdout.write("\n")

        with open(self.statsFile, "w") as f:
            f.write(stat.getSerializedStats())
Exemple #2
0
def main():
    """Run the templated query through Hive and save the result as a table.

    The query text, the table name and the storage path all contain
    double-brace placeholders.
    NOTE(review): presumably these are filled in by a templating step before
    this script is executed -- confirm against the job launcher.
    """
    hive_context = HiveContext(SparkContext())
    query_result = hive_context.sql("""{{sql}}""")

    # Output is written as JSON, replacing any existing table contents.
    save_options = {
        'format': 'json',
        'mode': 'overwrite',
        'path': 's3://data/{{tableName}}',
    }
    DataFrameWriter(query_result).saveAsTable(name='{{tableName}}',
                                              **save_options)
        phone = ob.get("bankpremobile", None)
    else:
        phone = None
    return (phone, idCard, idBank, name)


# xiaoshudian_app_key = "1186159692"
# xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"]
# def filter(app_key,api):
#     not (app_key in xiaoshudian_app_key and api not in  xiaoshudian_tec_api)


def _to_backflow_record(row):
    """Flatten one source row into the eight-field tuple listed in schemaStr.

    The result of ``standard_params(row.params)`` is read positionally:
    index 0 -> phone, 1 -> idcard, 2 -> idbank, 3 -> name.
    """
    parsed = standard_params(row.params)
    return (row.app_key_param, row.date,
            parsed[0], parsed[1], parsed[2], parsed[3],
            row.interface, row.api_type)


# A single map with a named function replaces the two chained lambdas; the
# second one relied on tuple-parameter unpacking, which Python 3 rejects.
data_rdd = data.rdd.map(_to_backflow_record)

# Every column is a nullable string; the order must match the tuple built by
# _to_backflow_record.
schemaStr = "app_key date phone idcard idbank name interface api_type"
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaStr.split()
]
schema = StructType(fields)

# De-duplicate the rows, then overwrite the target table with the result.
data_df = hc.createDataFrame(data_rdd, schema).distinct()
dfw = DataFrameWriter(data_df)
dfw.saveAsTable("wl_analysis.t_lel_record_data_backflow", mode="overwrite")
Exemple #4
0
    if ob.has_key("mobile"):
        phone = ob.get("mobile", None)
    elif ob.has_key("phone"):
        phone = ob.get("phone", None)
    elif ob.has_key("enc_m"):
        phone = ob.get("enc_m", None)
    elif ob.has_key("ownerMobile"):
        phone = ob.get("ownerMobile", None)
    elif ob.has_key("bankpremobile"):
        phone = ob.get("bankpremobile", None)
    else:
        phone = None
    return (phone, idCard, idBank, name)


# xiaoshudian_app_key = "1186159692"
# xiaoshudian_tec_api = ["tel","address_getbymobile","channel_NameIDCardAccountVerify","channel_cellphone","operator_capricorn","address_match","channel_idcard","channel_bankby3","channel_idNameFase","channel_criminal","channel_blacklistverify","credit_implement"]
# def filter(app_key,api):
#     not (app_key in xiaoshudian_app_key and api not in  xiaoshudian_tec_api)
def _to_filtered_backflow_record(row):
    """Flatten one source row into the eight-field tuple listed in schemaStr.

    The result of ``standard_params(row.params_less)`` is read positionally
    and fills the phone, idcard, idbank and name columns, in that order.
    """
    parsed = standard_params(row.params_less)
    return (row.app_key_param, row.date,
            parsed[0], parsed[1], parsed[2], parsed[3],
            row.interface, row.api_type)


# A single map with a named function replaces the two chained lambdas; the
# second one relied on tuple-parameter unpacking, which Python 3 rejects.
data_rdd = data.rdd.map(_to_filtered_backflow_record)

# Every column is a nullable string; the order must match the tuple built by
# _to_filtered_backflow_record.
schemaStr = "app_key date phone idcard idbank name interface api_type"
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaStr.split()
]
schema = StructType(fields)

# De-duplicate the rows, then overwrite the target table with the result.
data_df = sqlContext.createDataFrame(data_rdd, schema).distinct()
dfw = DataFrameWriter(data_df)
dfw.saveAsTable("wl_analysis.t_lel_datamart_backflow_filtered",
                mode="overwrite")