from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType


def run(self):
     sc = SparkContext("local", "gender")
     sqlContext = SQLContext(sc)
     #StringType =(str, unicode)
     _out = self.output().open('w')
     #lines = sc.textFile("myUser.csv")
     #fobj = self.input().open("r")
     #lines = sc.textFile(fobj.name)
     print(type(self.required_tasks['insert_source'].output()))
     print(self.required_tasks['insert_source'])
     #print(self.input()['insert_source'].input())
     lines = sc.textFile("myUser.csv")
     parts = lines.map(lambda l: l.split(","))
     users = parts.map(lambda p: (p[0], p[1], p[2], p[3], p[4], p[5], p[
         6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[
             16], p[17], p[18], p[19]))
     schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
     print(schemaString)
     _out.write(schemaString)
     fields = [
         StructField(field_name, StringType(), True)
         for field_name in schemaString.split()
     ]
     schema = StructType(fields)
     #schemaUser = sqlContext.createDataFrame(users, schema)
     schemaUser = sqlContext.applySchema(users, schema)
     schemaUser.registerTempTable("users")
     results = sqlContext.sql("SELECT gender FROM users")
     genders = results.map(lambda p: (p, 1))
     counts = genders.reduceByKey(
         lambda a, b: a + b
     )  #.map(lambda t: ("Gender " + str(t[0]) + " No " + str(t[1]))).collect()
     for name in counts.collect():
         _out.write(str(name))
     _out.close()
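
# A minimal sketch of the same gender count written against the
# non-deprecated createDataFrame API hinted at by the commented-out line
# above; it assumes the same myUser.csv layout (Spark 1.3+).
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType

sc = SparkContext("local", "gender")
sqlContext = SQLContext(sc)

schemaString = ("userId lmsUserId lmsName orgName name gender registrationDate "
                "emailId mothertounge highestEduDegree goals city state active "
                "firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId")
schema = StructType(
    [StructField(name, StringType(), True) for name in schemaString.split()])

users = sc.textFile("myUser.csv").map(lambda l: tuple(l.split(",")[:20]))
schemaUser = sqlContext.createDataFrame(users, schema)
schemaUser.registerTempTable("users")

# One SQL aggregation replaces the manual map/reduceByKey step above.
for row in sqlContext.sql(
        "SELECT gender, COUNT(*) AS cnt FROM users GROUP BY gender").collect():
    print(row)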
Example #3
import csv

from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, StringType

sqlContext = SQLContext(sc)


def csvParse(tup):
    line = tup[0]
    reader = csv.reader([line])
    return list(reader)[0]


# Load File, remove header, and parse
file = sc.textFile(
    "hdfs://wolf.iems.northwestern.edu/user/huser88/crime/Crimes_-_2001_to_present.csv"
).zipWithIndex().filter(lambda x: x[1] > 0).map(csvParse)

# Create RDD with year and month
file1 = file.map(lambda x: Row(id=x[0], date=x[2])).cache()

# Prepare for sql queries
headers = "date id"
fields = [
    StructField(field_name, StringType(), True)
    for field_name in headers.split()
]
schema = StructType(fields)

schema_file = sqlContext.applySchema(file1, schema)
schema_file.registerTempTable("crime1")

# Get monthly average crime rate
crimeByMonth = sqlContext.sql(
    "SELECT substr(date, 0,2), COUNT(id)/COUNT(DISTINCT substr(date,7,4)) AS avgCrimeCnt FROM crime1 GROUP BY substr(date,0,2)"
)

# Print output to screen
for m in crimeByMonth.collect():
    print(m)
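
# The substr() arithmetic in the query above assumes the Chicago crime
# dataset's date layout, e.g. "07/21/2014 10:30:00 PM" (value is
# illustrative): characters 1-2 hold the month and 7-10 the year, and
# Spark SQL's substr() is 1-based (a start index of 0 is treated like 1).
# The same slicing in plain Python:
sample_date = "07/21/2014 10:30:00 PM"
print(sample_date[0:2], sample_date[6:10])   # -> 07 2014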
def process(line):
    # NOTE: the original body of process(), which extracts the tweet text and
    # its timestamp from a raw line, was not included in this snippet.
    return content, tweet_time


path = 'tweets_af'
tweet = sc.textFile(path).map(process)

sqlContext = SQLContext(sc)
schemaString = "text created_at"

fields = [
    StructField(field_name, StringType(), False)
    for field_name in schemaString.split()
]
schema = StructType(fields)

schemaTweet = sqlContext.applySchema(tweet, schema)

schemaTweet.registerTempTable("tweet")

#text = sqlContext.sql("SELECT count(1) FROM tweet WHERE text like '%upset%' and created_at = 20140609")
text = sqlContext.sql(
    "SELECT count(1), created_at FROM tweet GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%hope%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%happy%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%fear%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%worry%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%nervous%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%anxious%' GROUP BY created_at")
#text = sqlContext.sql("SELECT count(1), created_at FROM tweet WHERE text like '%upset%' GROUP BY created_at")

text.collect()
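
# The commented-out queries above differ only in the keyword inside LIKE;
# a small sketch (keyword list copied from those comments) that runs them
# all in one loop:
for kw in ["upset", "hope", "happy", "fear", "worry", "nervous", "anxious"]:
    counts = sqlContext.sql(
        "SELECT count(1), created_at FROM tweet "
        "WHERE text LIKE '%" + kw + "%' GROUP BY created_at").collect()
    print(kw, counts)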
Example #5
import os

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSQL")
    sqlContext = SQLContext(sc)

    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a DataFrame and print the schema
    some_df = sqlContext.inferSchema(some_rdd)
    some_df.printSchema()

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a DataFrame by applying the schema to the RDD and print the schema
    another_df = sqlContext.applySchema(another_rdd, schema)
    another_df.printSchema()
    # root
    #  |-- age: integer (nullable = true)
    #  |-- name: string (nullable = true)

    # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
    # Create a DataFrame from the file(s) pointed to by path
    people = sqlContext.jsonFile(path)
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # The inferred schema can be visualized using the printSchema() method.
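    # A short continuation in the same style, roughly how the upstream Spark
    # SQL example proceeds: print the schema inferred from the JSON file and
    # query it with SQL (people.json ships with Spark).
    people.printSchema()
    people.registerTempTable("people")
    teenagers = sqlContext.sql(
        "SELECT name FROM people WHERE age >= 13 AND age <= 19")
    for each in teenagers.collect():
        print(each)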

# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from decimal import Decimal
from pyspark.sql.types import StructType, StructField, DecimalType

conf = SparkConf().setAppName("spark_sql_datatype_decimal")

sc = SparkContext(conf=conf)

hc = SQLContext(sc)

source = sc.parallelize([(Decimal("1.0"), Decimal("2.0"))])

schema = StructType([
    StructField("col1", DecimalType(), False),
    StructField("col2", DecimalType(), False)
])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql("select col1 + col2, col2 + 1.0 from temp_table").collect()

sc.stop()

for row in rows:
    print(row)
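
# coding: utf-8
# A minimal sketch of the same Decimal example against the non-deprecated
# createDataFrame API, with an explicit precision/scale on DecimalType
# (available in later Spark 1.x releases); input values match the snippet
# above, the app name is illustrative.

from decimal import Decimal

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, DecimalType

sc = SparkContext(conf=SparkConf().setAppName("spark_sql_datatype_decimal_v2"))
hc = SQLContext(sc)

source = sc.parallelize([(Decimal("1.0"), Decimal("2.0"))])

schema = StructType([
    StructField("col1", DecimalType(10, 2), False),
    StructField("col2", DecimalType(10, 2), False)
])

table = hc.createDataFrame(source, schema)
table.registerTempTable("temp_table")

for row in hc.sql("select col1 + col2, col2 + 1.0 from temp_table").collect():
    print(row)

sc.stop()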
Example #8
    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a SchemaRDD and print the schema
    some_schemardd = sqlContext.inferSchema(some_rdd)
    some_schemardd.printSchema()

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlContext.applySchema(another_rdd, schema)
    another_schemardd.printSchema()
    # root
    #  |-- age: integer (nullable = true)
    #  |-- name: string (nullable = true)

    # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
    # Create a SchemaRDD from the file(s) pointed to by path
    people = sqlContext.jsonFile(path)
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # The inferred schema can be visualized using the printSchema() method.
Example #10
                        StructField("UserLocation", StringType(), False),  \
                        StructField("Accuracy", IntegerType(), False),  \
                        StructField("BrowsingSession", IntegerType(), False),  \
                        StructField("Uplink", IntegerType(), False),  \
                        StructField("Downlink", IntegerType(), False), \
                        StructField("Urls",ArrayType( \
                                                     StructType([StructField("name", StringType(), False),StructField("domain", StringType(), True), \
                                                                    StructField("categories", ArrayType(StringType(), True), True)]), False), \
                                                True)])

    #get the important fields
    resultMap_newFormat = log.map(lambda line: line.split(";")) \
                             .filter(lambda line: len(line) > 1) \
                             .map(lambda row: (str(row[0]),
                                               int(row[1]),
                                               int(row[2]),
                                               getData(row[3]),
                                               int(row[4]),
                                               str(row[5]),
                                               int(row[6]),
                                               int(row[7]),
                                               int(row[8]),
                                               int(row[9]),
                                               getCategory(row[9])))

    schemaDataFrame = sqlContext.applySchema(resultMap_newFormat, newStructure)
    #schemaDataFrame= sqlContext.applySchema(resultMap_FilterUlr, fields)
    data = schemaDataFrame.toJSON()
    data.saveAsTextFile("result" + str(time.time()))

    sc.stop()
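
# A self-contained sketch of the nested-schema idea from the (truncated)
# snippet above: a column holding an array of structs, built with the
# non-deprecated createDataFrame API and exported as JSON lines. All field
# names and values below are illustrative.
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import (StructType, StructField, StringType,
                               IntegerType, ArrayType)

sc = SparkContext("local", "nested_schema_demo")
sqlContext = SQLContext(sc)

url_type = StructType([
    StructField("name", StringType(), False),
    StructField("domain", StringType(), True),
    StructField("categories", ArrayType(StringType(), True), True)
])
demo_schema = StructType([
    StructField("UserLocation", StringType(), False),
    StructField("Uplink", IntegerType(), False),
    StructField("Urls", ArrayType(url_type, False), True)
])

demo_rows = sc.parallelize(
    [("cell-42", 1234, [("http://example.org/a", "example.org", ["news"])])])
demo_df = sqlContext.createDataFrame(demo_rows, demo_schema)
print(demo_df.toJSON().collect())

sc.stop()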
Example #13
messages = lines.map(lambda l: l.split(","))
messages_subset = messages.map(lambda p: Row(ip=p[0], user=p[1], date=p[2], time=p[3]))    

# Ex 2
lines = sc.textFile("file:///" + "C:/coding/Hadoop/pig/MapReduceInputData/VH_Formtype.txt")
messages = lines.map(lambda l: l.split("\t"))
messages_subset = messages.map(lambda p: Row(formtypename=p[1]))    

# See example: http://spark.apache.org/docs/latest/sql-programming-guide.html 
schema_messages = sqlContext.inferSchema(messages_subset)
# NOTE: inferSchema is deprecated, please use createDataFrame instead

schemaString = "ip user date time"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
schema_messages = sqlContext.applySchema(messages_subset, schema)
# NOTE: applySchema is deprecated, please use createDataFrame instead

schema_messages.registerTempTable("messages_subset")

# Ex 1
data = sqlContext.sql("SELECT * FROM messages_subset") # Can then use RDD operations on the returned RDD
# Ex 2
data = sqlContext.sql("""SELECT formtypename, count(formtypename) AS processed FROM
    messages_subset GROUP BY formtypename ORDER BY formtypename""")
data2 = data.map(lambda r: r).collect()
for d in data2: # An RDD(?) on Row objects. TODO: How to convert from Row?
    print(d[0], d[1])
formtypes = {} # Add formtypes to a dictionary
for d in data2:
    formtypes[d[0]] = d[1]
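
# As the NOTE comments above point out, inferSchema/applySchema are
# deprecated; a minimal sketch of the createDataFrame replacement for the
# same formtypename data (Spark 1.3+):
schema_messages = sqlContext.createDataFrame(messages_subset)  # schema inferred from the Row objects
schema_messages.registerTempTable("messages_subset")
for d in sqlContext.sql("""SELECT formtypename, count(formtypename) AS processed FROM
    messages_subset GROUP BY formtypename ORDER BY formtypename""").collect():
    print(d[0], d[1])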