```python
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Create a local SparkContext and wrap it in an SQLContext,
# the Spark 1.x entry point for DataFrame and SQL functionality
sc = SparkContext("local", "my app")
sqlContext = SQLContext(sc)
```
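If you are on Spark 2.0 or later, `SparkSession` supersedes `SQLContext` as the entry point. A minimal equivalent setup looks like this (the session is named `spark` here by convention):

```python
from pyspark.sql import SparkSession

# Spark 2.x+: SparkSession bundles SparkContext and SQLContext
spark = SparkSession.builder \
    .master("local") \
    .appName("my app") \
    .getOrCreate()
```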
```python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Build a DataFrame from an RDD of tuples with an explicit schema.
# id and age are integers in the data, so they need IntegerType;
# declaring them as StringType would make createDataFrame fail.
rdd = sc.parallelize([(1, "John Doe", 25), (2, "Jane Smith", 23)])
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df = sqlContext.createDataFrame(rdd, schema)

# Register the DataFrame as a temporary table so it can be queried with SQL
df.registerTempTable("people")
sqlContext.sql("SELECT * FROM people WHERE age > 24").show()
```
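The same filter can also be expressed with the DataFrame API instead of SQL, using the `df` defined above:

```python
# Equivalent filter using the DataFrame API
df.filter(df.age > 24).show()
# With the sample data above, this should print only John Doe's row (id 1, age 25)
```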
```python
# Read a CSV file into a DataFrame via the external spark-csv package
df = sqlContext.read.format("com.databricks.spark.csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("myfile.csv")
df.printSchema()
```

In this example, we read a CSV file into a DataFrame using the `read` interface of SQLContext. We specify the file format as "com.databricks.spark.csv", and provide options to indicate that the first line of the file is the header row and that the schema should be inferred from the data. We then print the schema of the resulting DataFrame using the `printSchema()` method.
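The `com.databricks.spark.csv` format comes from the external spark-csv package used with Spark 1.x. On Spark 2.0 and later, CSV support is built in, so no extra package is needed; assuming the `SparkSession` named `spark` from the earlier sketch, the same read looks like this:

```python
# Spark 2.x+: built-in CSV reader, no external package required
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("myfile.csv")
df.printSchema()
```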