from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType


def run(self):
    sc = SparkContext("local", "gender")
    sqlContext = SQLContext(sc)
    _out = self.output().open('w')
    print(type(self.required_tasks['insert_source'].output()))
    print(self.required_tasks['insert_source'])
    #print(self.input()['insert_source'].input())
    # Read the user CSV and split each line into its 20 columns
    lines = sc.textFile("myUser.csv")
    parts = lines.map(lambda l: l.split(","))
    users = parts.map(lambda p: tuple(p[:20]))
    schemaString = ("userId lmsUserId lmsName orgName name gender "
                    "registrationDate emailId mothertounge highestEduDegree "
                    "goals city state active firstAccesDate lastAccessDate "
                    "allowCert yearOfBirth pincode aadharId")
    print(schemaString)
    _out.write(schemaString)
    # Build a schema of all-string, nullable columns
    fields = [StructField(field_name, StringType(), True)
              for field_name in schemaString.split()]
    schema = StructType(fields)
    #schemaUser = sqlContext.createDataFrame(users, schema)
    schemaUser = sqlContext.applySchema(users, schema)
    schemaUser.registerTempTable("users")
    results = sqlContext.sql("SELECT gender FROM users")
    # Count rows per gender; each key is a one-field Row(gender=...)
    genders = results.map(lambda p: (p, 1))
    counts = genders.reduceByKey(lambda a, b: a + b)
    #.map(lambda t: ("Gender " + t[0] + " No " + t[1])).collect()
    for name in counts.collect():
        _out.write(str(name))
    _out.close()
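# A hedged alternative for the counting step, assuming Spark 1.3+ and the
# same sc, users RDD, and schema as above: createDataFrame replaces the
# deprecated applySchema, and the per-gender count can be pushed into the
# SQL itself instead of a separate map/reduceByKey pass.
schemaUser = sqlContext.createDataFrame(users, schema)
schemaUser.registerTempTable("users")
counts = sqlContext.sql(
    "SELECT gender, COUNT(*) AS cnt FROM users GROUP BY gender").collect()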
# The body of process() is elided in the source; it parses one raw tweet
# line and returns the tweet text plus its creation date. The signature
# shown here is reconstructed from its use in the map() call below.
def process(line):
    ...
    return content, tweet_time


path = 'tweets_af'
tweet = sc.textFile(path).map(process)
sqlContext = SQLContext(sc)
schemaString = "text created_at"
fields = [StructField(field_name, StringType(), False)
          for field_name in schemaString.split()]
schema = StructType(fields)
schemaTweet = sqlContext.applySchema(tweet, schema)
schemaTweet.registerTempTable("tweet")
#text = sqlContext.sql("SELECT count(1) FROM tweet WHERE text like '%upset%' and created_at = 20140609")
# Daily tweet counts; the commented variants count keyword matches per day
text = sqlContext.sql(
    "SELECT count(1), created_at FROM tweet GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%hope%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%happy%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%fear%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%worry%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%nervous%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%anxious%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%upset%' GROUP BY created_at")
text.collect()
import os

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# RDD is created from a list of rows
some_rdd = sc.parallelize([Row(name="John", age=19),
                           Row(name="Smith", age=23),
                           Row(name="Sarah", age=18)])

# Infer schema from the first row, create a DataFrame and print the schema
some_df = sqlContext.inferSchema(some_rdd)
some_df.printSchema()
# root
# |-- age: integer (nullable = true)
# |-- name: string (nullable = true)

# Another RDD is created from a list of tuples
another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])

# Schema with two fields - person_name and person_age
schema = StructType([StructField("person_name", StringType(), False),
                     StructField("person_age", IntegerType(), False)])

# Create a DataFrame by applying the schema to the RDD and print the schema
another_df = sqlContext.applySchema(another_rdd, schema)
another_df.printSchema()
# root
# |-- person_name: string (nullable = false)
# |-- person_age: integer (nullable = false)

# A JSON dataset is pointed to by path.
# The path can be either a single text file or a directory storing text files.
path = os.path.join(os.environ['SPARK_HOME'],
                    "examples/src/main/resources/people.json")
# Create a DataFrame from the file(s) pointed to by path
people = sqlContext.jsonFile(path)
# The inferred schema can be visualized using the printSchema() method.
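# A hedged side note: jsonFile, like inferSchema and applySchema, was
# deprecated in later releases; assuming Spark 1.4+, the same load is
# usually written through the reader interface with the same path variable.
people = sqlContext.read.json(path)
people.printSchema()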
import csv

from pyspark.sql import Row, SQLContext
from pyspark.sql.types import StructType, StructField, StringType

sqlContext = SQLContext(sc)


def csvParse(tup):
    line = tup[0]
    reader = csv.reader([line])
    return list(reader)[0]

# Load file, drop the header row (index 0 from zipWithIndex), and parse as CSV
file = sc.textFile(
    "hdfs://wolf.iems.northwestern.edu/user/huser88/crime/Crimes_-_2001_to_present.csv"
).zipWithIndex().filter(lambda x: x[1] > 0).map(csvParse)

# Create RDD with the id and date columns
file1 = file.map(lambda x: Row(id=x[0], date=x[2])).cache()

# Prepare for SQL queries
headers = "date id"
fields = [StructField(field_name, StringType(), True)
          for field_name in headers.split()]
schema = StructType(fields)
schema_file = sqlContext.applySchema(file1, schema)
schema_file.registerTempTable("crime1")

# Get monthly average crime rate: substr(date, 0, 2) is the month and
# substr(date, 7, 4) the year, assuming MM/DD/YYYY-formatted dates
crimeByMonth = sqlContext.sql(
    "SELECT substr(date, 0, 2), COUNT(id)/COUNT(DISTINCT substr(date, 7, 4)) "
    "AS avgCrimeCnt FROM crime1 GROUP BY substr(date, 0, 2)")

# Print output to screen
for m in crimeByMonth.collect():
    print m
# coding: utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from decimal import Decimal
from pyspark.sql.types import StructType, StructField, DecimalType

conf = SparkConf().setAppName("spark_sql_datatype_decimal")
sc = SparkContext(conf=conf)
hc = SQLContext(sc)

source = sc.parallelize([(Decimal("1.0"), Decimal("2.0"))])
schema = StructType([StructField("col1", DecimalType(), False),
                     StructField("col2", DecimalType(), False)])
table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql("select col1 + col2, col2 + 1.0 from temp_table").collect()
sc.stop()

for row in rows:
    print row
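# A hedged variant of the schema above: DecimalType also accepts explicit
# precision and scale arguments, which avoids relying on the
# version-specific default of a bare DecimalType().
schema = StructType([StructField("col1", DecimalType(10, 2), False),
                     StructField("col2", DecimalType(10, 2), False)])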
StructField("UserLocation", StringType(), False), \ StructField("Accuracy", IntegerType(), False), \ StructField("BrowsingSession", IntegerType(), False), \ StructField("Uplink", IntegerType(), False), \ StructField("Downlink", IntegerType(), False), \ StructField("Urls",ArrayType( \ StructType([StructField("name", StringType(), False),StructField("domain", StringType(), True), \ StructField("categories", ArrayType(StringType(), True), True)]), False), \ True)]) #get the important fields resultMap_newFormat = log.map(lambda line: line.split(";")).filter(lambda line: len(line)>1). \ map(lambda row: (str(row[0]), \ int(row[1]), \ int(row[2]), \ getData(row[3]), \ int(row[4]), \ str(row[5]), \ int(row[6]), \ int(row[7]), \ int(row[8]), \ int(row[9]), \ getCategory(row[9]))) schemaDataFrame = sqlContext.applySchema(resultMap_newFormat, newStructure) #schemaDataFrame= sqlContext.applySchema(resultMap_FilterUlr, fields) data = schemaDataFrame.toJSON() data.saveAsTextFile("result" + str(time.time())) sc.stop()
Row(name="Smith", age=23), Row(name="Sarah", age=18) ]) # Infer schema from the first row, create a DataFrame and print the schema some_df = sqlContext.inferSchema(some_rdd) some_df.printSchema() # Another RDD is created from a list of tuples another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)]) # Schema with two fields - person_name and person_age schema = StructType([ StructField("person_name", StringType(), False), StructField("person_age", IntegerType(), False) ]) # Create a DataFrame by applying the schema to the RDD and print the schema another_df = sqlContext.applySchema(another_rdd, schema) another_df.printSchema() # root # |-- age: integer (nullable = true) # |-- name: string (nullable = true) # A JSON dataset is pointed to by path. # The path can be either a single text file or a directory storing text files. path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json") # Create a DataFrame from the file(s) pointed to by path people = sqlContext.jsonFile(path) # root # |-- person_name: string (nullable = false) # |-- person_age: integer (nullable = false)
# Ex 1: parse comma-separated log lines into ip/user/date/time rows
# (the textFile call that defines lines for this example is elided in the source)
messages = lines.map(lambda l: l.split(","))
messages_subset = messages.map(lambda p: Row(ip=p[0], user=p[1],
                                             date=p[2], time=p[3]))

# Ex 2: parse tab-separated lines, keeping only the form type name
lines = sc.textFile("file:///" +
                    "C:/coding/Hadoop/pig/MapReduceInputData/VH_Formtype.txt")
messages = lines.map(lambda l: l.split("\t"))
messages_subset = messages.map(lambda p: Row(formtypename=p[1]))

# See example: http://spark.apache.org/docs/latest/sql-programming-guide.html
schema_messages = sqlContext.inferSchema(messages_subset)
# NOTE: inferSchema is deprecated, please use createDataFrame instead

# NOTE: this explicit schema belongs to Ex 1; it does not match the
# one-field formtypename rows that messages_subset now holds
schemaString = "ip user date time"
fields = [StructField(field_name, StringType(), True)
          for field_name in schemaString.split()]
schema = StructType(fields)
schema_messages = sqlContext.applySchema(messages_subset, schema)
# NOTE: applySchema is deprecated, please use createDataFrame instead
schema_messages.registerTempTable("messages_subset")

# Ex 1
data = sqlContext.sql("SELECT * FROM messages_subset")
# Can then use RDD operations on the returned RDD

# Ex 2
data = sqlContext.sql("""SELECT formtypename, count(formtypename) AS processed
    FROM messages_subset GROUP BY formtypename ORDER BY formtypename""")
data2 = data.map(lambda r: r).collect()  # identity map is a no-op; data.collect() would do
for d in data2:
    # A list of Row objects. TODO: How to convert from Row?
    print d[0], d[1]

formtypes = {}
# Add formtypes to a dictionary
for d in data2:
    formtypes[d[0]] = d[1]
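# The TODO above asks how to get values out of a Row: a minimal hedged
# sketch, assuming standard pyspark.sql.Row semantics, where fields are
# reachable by position, by attribute (here the query aliases formtypename
# and processed), or via asDict().
for d in data2:
    formtypes[d.formtypename] = d.processed
    # equivalently: row = d.asDict(); formtypes[row['formtypename']] = row['processed']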