def main(*argv):
    # Set up to read from S3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')

    conf = SparkConf().setAppName("taxishift1")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)

    # Read each month's CSV from S3 (TextInputFormat: LongWritable offset key, Text line value)
    for i in range(1, 13):
        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',
                                 'org.apache.hadoop.mapred.TextInputFormat',
                                 'org.apache.hadoop.io.LongWritable',
                                 'org.apache.hadoop.io.Text')

        # Call user-defined function to map raw data into key-value pairs
        new_data = raw_data.map(lambda x: splitit(x[1]))

        # Combine data from the multiple CSV files
        if i < 2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

    # Build the total list of rides/shifts for each driver
    total_data = total_data.reduceByKey(lambda x, y: x + y)\
                           .mapValues(lambda x: mergeit(x))

    # Create a key-value pair for each shift, and cap shifts longer than 10 hours
    # at 10 hours (we want to know exactly when they go over)
    ungrouped_data = total_data.flatMap(lambda x: [(x[0], r) for r in x[1]])\
                               .mapValues(lambda x: x if x[1] - x[0] < 36000 else [x[0], x[0] + 36000])

    # Extract only the 10-hour-shift offenders
    offenders = ungrouped_data.filter(lambda x: x[1][1] - x[1][0] >= 36000)\
                              .map(lambda x: x[1][1])\
                              .collect()

    # Histogram the offenders into 30-minute bins
    offender_hist = plt.hist(offenders,
                             bins=range(int(min(offenders)), int(max(offenders)) + 1800, 1800))
    hist_min = int(min(offenders))
    hist_min += 900  # start at the first bin's midpoint

    # Save bin midpoints and counts to file
    hist_csv = open('/home/ubuntu/offenders.csv', 'w')
    for count in offender_hist[0]:
        hist_csv.write(str(hist_min) + ',' + str(count) + '\n')
        hist_min += 1800
    hist_csv.close()

    return
def main(*argv):
    # Set up to read from S3
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID', 'default')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY', 'default')

    conf = SparkConf().setAppName("taxishift3")
    SparkContext.setSystemProperty('spark.executor.memory', '5g')
    sc = SparkContext(conf=conf)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsAccessKeyId", aws_access_key)
    sc._jsc.hadoopConfiguration().set("fs.s3a.awsSecretAccessKey", aws_secret_access_key)

    # Read each month's CSV from S3 (TextInputFormat: LongWritable offset key, Text line value)
    for i in range(1, 13):
        raw_data = sc.hadoopFile('s3a://ddrum-s3/trip_data/trip_data_' + str(i) + '.csv',
                                 'org.apache.hadoop.mapred.TextInputFormat',
                                 'org.apache.hadoop.io.LongWritable',
                                 'org.apache.hadoop.io.Text')

        # Call user-defined function to map raw data into key-value pairs
        new_data = raw_data.map(lambda x: splitit(x[1]))

        # Combine data from the multiple CSV files
        if i < 2:
            total_data = new_data
        else:
            total_data = total_data.union(new_data)

    # Build the total list of pickup chains and pickups for each coordinate.
    # If there are more than three times as many pickups as pickup chains,
    # the coordinate might be a taxi stand.
    total_data = total_data.reduceByKey(lambda x, y: x + y)\
                           .mapValues(lambda x: mergeit(x))\
                           .filter(lambda x: x[1][1] / x[1][0] > 3)\
                           .collect()

    # Print the results
    total_data.sort()
    for result in total_data:
        print(result)

    return
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_rcfile")
sc = SparkContext(conf=conf)

# Read an RCFile with the old Hadoop API; a custom converter turns each
# BytesRefArrayWritable row value into a Python object array
rowRDD = sc.hadoopFile(
    path="hdfs://dip.cdh5.dev:8020/user/yurun/rcfile",
    inputFormatClass="org.apache.hadoop.hive.ql.io.RCFileInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable",
    valueConverter="com.sina.dip.spark.converter.BytesRefArrayWritableToObjectArrayConverter")

pairs = rowRDD.collect()

for pair in pairs:
    print(pair[0], pair[1])

sc.stop()
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # get job steps. don't pass --steps, which is deprecated
    job = job_class(job_args)
    steps = job.steps()

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_job(mrc, step_num):
        j = job_class(job_args + ['--%s' % mrc, '--step-num=%d' % step_num])

        j.sandbox()  # so Spark doesn't try to serialize stdin

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if job.hadoop_input_format() is not None:
            rdd = sc.hadoopFile(args.input_path,
                                inputFormatClass=job.hadoop_input_format(),
                                keyClass='org.apache.hadoop.io.Text',
                                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_job, args.num_reducers)

        # write the results
        if job.hadoop_output_format() is not None:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert to
            # that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))

            rdd.saveAsHadoopFile(args.output_path,
                                 outputFormatClass=job.hadoop_output_format(),
                                 compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(args.output_path,
                               compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            sc.parallelize([json.dumps(counters)],
                           numSlices=1).saveAsTextFile(args.counter_output_dir)
if __name__ == '__main__':
    if len(sys.argv) != 5:
        print("Usage: spark_streaming.py <master> <begin> <end> <input>",
              file=sys.stderr)
        exit(-1)

    master, time_begin, time_end, input = sys.argv[1:]
    input_path = input + '/' + time_begin + '.csv'
    logger.info("--->" + master + " " + input_path)

    sc = SparkContext(master, 'wxcity_userlogin_repeat_app')
    sql_context = SQLContext(sc)

    # Read the raw login records (TextInputFormat: LongWritable offset key, Text line value)
    lines = sc.hadoopFile(input,
                          'org.apache.hadoop.mapred.TextInputFormat',
                          'org.apache.hadoop.io.LongWritable',
                          'org.apache.hadoop.io.Text')

    # Build a gateway-id -> hospital-id lookup from MySQL
    rs_tuples = MysqlDao().findWithQuery(ConfigPortalSql.select_mysql_hos_gw_sup)
    gwid_hosid_dict = {}
    for r in rs_tuples:
        hos_id = str(r[0])
        gw_id = r[1]
        gwid_hosid_dict[gw_id] = hos_id
    logger.debug('-->gwid_hosid:' + str(len(gwid_hosid_dict)))

    # Keep only well-formed 17-field records and append the hospital id
    users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
        .map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(),
                        p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(),
                        p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(),
                        p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [
            step.description(step_num)
            for step_num, step in enumerate(job.steps())
        ]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, counter, amount=1):
            counter_accumulator.add({group: {counter: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + ['--%s' % mrc, '--step-num=%d' % step_num])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        # if skip_internal_protocol is true, patch internal_protocol() to
        # return an object whose *read* and *write* attributes are ``None``
        if args.skip_internal_protocol:
            j.internal_protocol = lambda: _NO_INTERNAL_PROTOCOL

        return j

    # --emulate-map-input-file doesn't work with hadoop_input_format
    emulate_map_input_file = (
        args.emulate_map_input_file and not hadoop_input_format)

    try:
        if emulate_map_input_file:
            # load an rdd with pairs of (input_path, line). *path* here
            # has to be a single path, not a comma-separated list
            rdd = sc.union([
                _text_file_with_path(sc, path)
                for path in args.input_path.split(',')
            ])
        elif hadoop_input_format:
            rdd = sc.hadoopFile(args.input_path,
                                inputFormatClass=hadoop_input_format,
                                keyClass='org.apache.hadoop.io.Text',
                                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_mrc_job,
                            args.num_reducers, sort_values,
                            emulate_map_input_file,
                            args.skip_internal_protocol)

        # max_output_files: limit number of partitions
        if args.max_output_files:
            rdd = rdd.coalesce(args.max_output_files)

        # write the results
        if hadoop_output_format:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert to
            # that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))

            rdd.saveAsHadoopFile(args.output_path,
                                 outputFormatClass=hadoop_output_format,
                                 compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(args.output_path,
                               compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            # If the given path is a URI, save the counters through Spark;
            # otherwise write them directly into the local dir
            if is_uri(args.counter_output_dir):
                sc.parallelize([json.dumps(counters)],
                               numSlices=1).saveAsTextFile(
                    args.counter_output_dir)
            else:
                # Use the regular Python built-in file writer to create the
                # part-* file ourselves
                path = os.path.join(args.counter_output_dir, "part-00000")
                if not os.path.exists(args.counter_output_dir):
                    os.mkdir(args.counter_output_dir)
                with open(path, 'w') as wb:
                    wb.write(str(json.dumps(counters)))
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_seqfile")
sc = SparkContext(conf=conf)

# Read a SequenceFile whose keys are a custom IntArrayWritable; a custom key
# converter turns each key into a Python object array, and the NullWritable
# values are dropped
lineRDD = sc.hadoopFile(
    path="hdfs://dip.cdh5.dev:8020/user/yurun/seqfile",
    inputFormatClass="org.apache.hadoop.mapred.SequenceFileInputFormat",
    keyClass="com.sina.dip.spark.converter.IntArrayWritable",
    valueClass="org.apache.hadoop.io.NullWritable",
    keyConverter="com.sina.dip.spark.converter.IntArrayWritableToObjectArrayConverter"
).map(lambda pair: pair[0])

lines = lineRDD.collect()

for line in lines:
    print(line)

sc.stop()
def main():
    conf = SparkConf().setAppName("first")
    sc = SparkContext(conf=conf)

    # S3 credentials for the Hadoop configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", aws_key_id)
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", aws_key)
    config_dict = {
        "fs.s3n.awsAccessKeyId": aws_key_id,
        "fs.s3n.awsSecretAccessKey": aws_key
    }

    bucket = "project4capstones3"
    prefix = "/2017/07/*/*/*"
    filename = "s3n://{}/Trump/{}".format(bucket, prefix)

    # Read the raw tweet files (TextInputFormat: LongWritable offset key, Text line value)
    rdd = sc.hadoopFile(filename,
                        'org.apache.hadoop.mapred.TextInputFormat',
                        'org.apache.hadoop.io.LongWritable',
                        'org.apache.hadoop.io.Text',
                        conf=config_dict)

    spark = SparkSession.builder.appName("PythonWordCount").config(
        "spark.files.overwrite", "true").getOrCreate()

    # Parse each line as JSON and drop rows without a status_id
    df = spark.read.json(rdd.map(lambda x: x[1]))
    data_rm_na = df.filter(df["status_id"] != 'None')

    features_of_interest = ['rt_status_user_followers_count',
                            'rt_status_user_friends_count',
                            'rt_status_user_statuses_count',
                            'rt_status_retweet_count',
                            'rt_status_user_listed_count',
                            'rt_status_user_id',
                            'rt_status_created_at',
                            'status_created_at',
                            'rt_status_user_name',
                            'rt_status_num_user_mentions',
                            'searched_names',
                            'rt_status_sentMag',
                            'rt_status_sentScore',
                            'rt_status_favorite_count',
                            'status_id']
    df_reduce = data_rm_na.select(features_of_interest)

    # Cast the count columns from strings to integers
    df_reduce = df_reduce.withColumn(
        "rt_status_user_followers_count",
        df_reduce["rt_status_user_followers_count"].cast(IntegerType()))
    df_reduce = df_reduce.withColumn(
        "rt_status_user_friends_count",
        df_reduce["rt_status_user_friends_count"].cast(IntegerType()))
    df_reduce = df_reduce.withColumn(
        "rt_status_user_statuses_count",
        df_reduce["rt_status_user_statuses_count"].cast(IntegerType()))
    df_reduce = df_reduce.withColumn(
        "rt_status_retweet_count",
        df_reduce["rt_status_retweet_count"].cast(IntegerType()))
    df_reduce = df_reduce.withColumn(
        "rt_status_user_listed_count",
        df_reduce["rt_status_user_listed_count"].cast(IntegerType()))
    df_reduce = df_reduce.withColumn(
        "rt_status_favorite_count",
        df_reduce["rt_status_favorite_count"].cast(IntegerType()))
    df_reduce = df_reduce.withColumn(
        "rt_status_num_user_mentions",
        df_reduce["rt_status_num_user_mentions"].cast(IntegerType()))

    # Write the reduced DataFrame to a MySQL table over JDBC
    url_ = "jdbc:mysql://twittertalker1.csjkhjjygutf.us-east-1.rds.amazonaws.com:3306/innodb"
    table_name_ = "retweet"
    mode_ = "overwrite"
    df_reduce.write.format("jdbc").option("url", url_)\
        .option("dbtable", table_name_)\
        .option("driver", "com.mysql.jdbc.Driver")\
        .option("user", "XXXXXX")\
        .option("password", "XXXXXXXX")\
        .mode(mode_)\
        .save()
"""""" # 读取sequenceFile data = sc.sequenceFile(inputFile, "org.apache.hadoop.io.Text", 'org.apache.hadoop.io.IntWritable') # 保存sequenceFile data2 = sc.parallelize([('Panda', 3), ('Kay', 6), ('Snail', 2)]) data2.saveAsSequenceFile(outputFile) # 5.2.5 对象文件 """对象文件在 Python 中无法使用,不过 Python 中的 RDD 和 SparkContext 支持 saveAsPickleFile() 和 pickleFile() 方法作为替代。这使用了 Python 的 pickle 序列化库。不过,对象文件的 注意事项同样适用于 pickle 文件: pickle 库可能很慢,并且在修改类定义后,已经生产的 数据文件可能无法再读出来""" # 5.2.6 Hadoop输入输出格式 # 1.读取其他Hadoop输入格式 input2 = sc.hadoopFile(inputFile).map(lambda x, y: (str(x), str(y))) # 2.保存Hadoop输出格式 input2.saveAsNewAPIHadoopFile(inputFile) # 3.to do # 5.3文件系统 # 5.3.1 本地文件系统 rdd = sc.textFile(inputFile) # 5.3.2 Amazon S3 # 5.3.3 HDFS:只需要将输入输出路径指定为hdfs://master:port/path即可 # 5.4 Spark SQL中的结构化数据,详见第九章 # 5.4.1Apache Hive """要把 Spark SQL 连接到已有的 Hive 上,你需要提供 Hive 的配置文件。你需要将 hive-site. xml 文件复制到 Spark 的 ./conf/ 目录下""" hiveCtx = HiveContext(sc) rows = hiveCtx.sql('SELECT name, age FROM users') firstRow = rows.first()
# sc.cancelJobGroup(groupId)
# sc.setJobGroup(groupId, "")

# Dump profile information into the given directory path
# sc.dump_profiles(path)

rdd = sc.emptyRDD()  # Create an RDD with no partitions or elements
print(sc.getConf())  # Returns the SparkConf object

# getLocalProperty(key)
# Get a local property set in this thread, or null if it is missing.
# See setLocalProperty().

# classmethod getOrCreate(conf=None)

"""
sc.hadoopFile()
sc.hadoopRDD()

sheet = sc.newAPIHadoopFile(
    '/user/me/sample.txt',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': 'Time\tMHist'}
)

parquet_rdd = sc.newAPIHadoopFile(
    path,
    'org.apache.parquet.avro.AvroParquetInputFormat',
    'java.lang.Void',
    'org.apache.avro.generic.IndexedRecord',
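# A minimal, self-contained sketch of the record-delimiter trick from the
# commented newAPIHadoopFile example above: with textinputformat.record.delimiter
# set, each RDD element is a (byte offset, record) pair where a record spans
# everything up to the next delimiter. The path and delimiter are the
# hypothetical ones from the notes above.
from pyspark import SparkContext

sc = SparkContext(appName='record_delimiter_example')
records = sc.newAPIHadoopFile(
    '/user/me/sample.txt',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': 'Time\tMHist'}
).map(lambda kv: kv[1])  # keep only the record text
print(records.take(2))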
def main(cmd_line_args=None):
    if cmd_line_args is None:
        cmd_line_args = sys.argv[1:]

    parser = _make_arg_parser()
    args = parser.parse_args(cmd_line_args)

    if args.num_reducers is not None and args.num_reducers <= 0:
        raise ValueError(
            'You can only configure num_reducers to positive number.')

    # get job_class
    job_module_name, job_class_name = args.job_class.rsplit('.', 1)
    job_module = import_module(job_module_name)
    job_class = getattr(job_module, job_class_name)

    # load initial data
    from pyspark import SparkContext

    if args.job_args:
        job_args = shlex_split(args.job_args)
    else:
        job_args = []

    # determine hadoop_*_format, steps
    # try to avoid instantiating a job in the driver; see #2044
    job = None

    if args.hadoop_input_format is None:
        job = job or job_class(job_args)
        hadoop_input_format = job.hadoop_input_format()
    else:
        hadoop_input_format = args.hadoop_input_format or None

    if args.hadoop_output_format is None:
        job = job or job_class(job_args)
        hadoop_output_format = job.hadoop_output_format()
    else:
        hadoop_output_format = args.hadoop_output_format or None

    if args.sort_values is None:
        job = job or job_class(job_args)
        sort_values = job.sort_values()
    else:
        sort_values = args.sort_values

    if args.steps_desc is None:
        job = job or job_class(job_args)
        steps = [step.description(step_num)
                 for step_num, step in enumerate(job.steps())]
    else:
        steps = json.loads(args.steps_desc)

    # pick steps
    start = args.first_step_num or 0
    end = None if args.last_step_num is None else args.last_step_num + 1
    steps_to_run = list(enumerate(steps))[start:end]

    sc = SparkContext()

    # keep track of one set of counters per job step
    counter_accumulators = [
        sc.accumulator(defaultdict(dict), CounterAccumulator())
        for _ in steps_to_run
    ]

    def make_increment_counter(step_num):
        counter_accumulator = counter_accumulators[step_num - start]

        def increment_counter(group, name, amount=1):
            counter_accumulator.add({group: {name: amount}})

        return increment_counter

    def make_mrc_job(mrc, step_num):
        j = job_class(job_args + [
            '--%s' % mrc, '--step-num=%d' % step_num
        ])

        # patch increment_counter() to update the accumulator for this step
        j.increment_counter = make_increment_counter(step_num)

        return j

    try:
        if hadoop_input_format:
            rdd = sc.hadoopFile(
                args.input_path,
                inputFormatClass=hadoop_input_format,
                keyClass='org.apache.hadoop.io.Text',
                valueClass='org.apache.hadoop.io.Text')

            # hadoopFile loads each line as a key-value pair in which the
            # contents of the line are the key and the value is an empty
            # string. Convert to an rdd of just lines, encoded as bytes.
            rdd = rdd.map(lambda kv: kv[0].encode('utf-8'))
        else:
            rdd = sc.textFile(args.input_path, use_unicode=False)

        # run steps
        for step_num, step in steps_to_run:
            rdd = _run_step(step, step_num, rdd, make_mrc_job,
                            args.num_reducers, sort_values)

        # max_output_files: limit number of partitions
        if args.max_output_files:
            rdd = rdd.coalesce(args.max_output_files)

        # write the results
        if hadoop_output_format:
            # saveAsHadoopFile takes an rdd of key-value pairs, so convert to
            # that format
            rdd = rdd.map(lambda line: tuple(
                x.decode('utf-8') for x in line.split(b'\t', 1)))

            rdd.saveAsHadoopFile(
                args.output_path,
                outputFormatClass=hadoop_output_format,
                compressionCodecClass=args.compression_codec)
        else:
            rdd.saveAsTextFile(
                args.output_path,
                compressionCodecClass=args.compression_codec)
    finally:
        if args.counter_output_dir is not None:
            counters = [ca.value for ca in counter_accumulators]

            sc.parallelize(
                [json.dumps(counters)],
                numSlices=1
            ).saveAsTextFile(
                args.counter_output_dir
            )
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("spark_app_read_data_from_seqfile")
sc = SparkContext(conf=conf)

lineRDD = sc.hadoopFile(
    path="hdfs://dip.cdh5.dev:8020/user/yurun/seqfile",
    inputFormatClass="org.apache.hadoop.mapred.SequenceFileInputFormat",
    keyClass="com.sina.dip.spark.converter.IntArrayWritable",
    valueClass="org.apache.hadoop.io.NullWritable",
    keyConverter="com.sina.dip.spark.converter.IntArrayWritableToObjectArrayConverter"
).map(lambda pair: pair[0])

lines = lineRDD.collect()

for line in lines:
    print(line)

sc.stop()
from pyspark import SparkContext, SparkConf

"""
Old-API (org.apache.hadoop.mapred) input formats:
    KeyValueTextInputFormat
    SequenceFileAsBinaryInputFormat
    SequenceFileAsTextInputFormat
    SequenceFileInputFormat<K,V>
    TextInputFormat
"""

sparkconf = SparkConf().setAppName('hadoop io read').setMaster('local')
sc = SparkContext(conf=sparkconf)

# TextInputFormat yields (byte offset, line) pairs: LongWritable keys, Text values
f = sc.hadoopFile('hdfs://172.19.0.2/newtextfile/part-00000',
                  'org.apache.hadoop.mapred.TextInputFormat',
                  'org.apache.hadoop.io.LongWritable',
                  'org.apache.hadoop.io.Text')

print(f.collect())
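# A minimal sketch using KeyValueTextInputFormat from the list above, assuming
# a tab-separated pairs file at a hypothetical path: each line is split on the
# first tab, so both key and value are Text rather than a LongWritable offset.
kv = sc.hadoopFile('hdfs://172.19.0.2/tab_separated_pairs/part-00000',
                   'org.apache.hadoop.mapred.KeyValueTextInputFormat',
                   'org.apache.hadoop.io.Text',
                   'org.apache.hadoop.io.Text')
print(kv.take(5))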