Example no. 1
0
# PySpark SQLContext demo: build DataFrames from Python data, run SQL over a
# temp table, then load and summarize a CSV file.
# Relies on `sc` (SparkContext) and `sqlContext` (SQLContext) being provided
# by the surrounding session (e.g. a pyspark shell) — TODO confirm.

# A DataFrame built directly from a list of dicts; Spark infers the schema
# (None is preserved as a null value).
d = [{'name': 'paul', 'age': 10,'gender':'male'},{'name': 'alice', 'age': 30,'gender':None}]
print(sqlContext.createDataFrame(d).collect())

# NOTE(review): `l` was undefined in the original (NameError at parallelize);
# reconstructed here as (name, age) tuples to match the explicit column list
# below and the name='bob' query — verify against the intended data.
l = [('paul', 10), ('alice', 30), ('bob', 25)]
rdd = sc.parallelize(l)
df = sqlContext.createDataFrame(rdd, ['name', 'age'])
print(df.head(2))

# Register the DataFrame as a temp table, query it with SQL, then drop it and
# show that the table registry is updated.
sqlContext.registerDataFrameAsTable(df, "table1")
df2 = sqlContext.sql("SELECT name,age from table1 where name='bob'")
print(df2.collect())
print(sqlContext.tableNames())
sqlContext.dropTempTable("table1")
print(sqlContext.tableNames())


# Load a CSV with the spark-csv data source, inferring column types from data.
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load("/home/harsh/mapping_minds_training/spark/train_u6lujuX_CVtuZ9i.csv")
# show() and printSchema() print directly and return None, so they are not
# wrapped in print() (the original printed a spurious "None" line after each).
df.groupBy('Gender').agg({'ApplicantIncome': 'mean'}).show()
print(df.head(3))
df.printSchema()
print(df.columns)
# Cache before count() so subsequent actions reuse the in-memory data.
df.cache()
print('count-------------------------->',df.count())
Example no. 2
0
 def spark_table_exists(sql_ctx: SQLContext, view: str) -> bool:
     """
     :return:
     """
     # noinspection PyBroadException
     return view in sql_ctx.tableNames()