def index():
    input_fmt_cls_name = 'com.mongodb.hadoop.MongoInputFormat'
    output_fmt_cls_name = 'com.mongodb.spark.PySparkMongoOutputFormat'
    val_cls_name = key_cls_name = 'com.mongodb.hadoop.io.BSONWritable'
    val_converter = key_converter = 'com.mongodb.spark.pickle.NoopConverter'

    config = load_config()
    host, port = config.get('mongo', 'host'), config.get('mongo', 'port')
    dbname = config.get('mongo', 'dbname')
    dbpath_in = 'mongodb://{}:{}/{}.documents'.format(host, port, dbname)
    dbpath_out = 'mongodb://{}:{}/{}.indexes_raw'.format(host, port, dbname)

    sc = SparkContext('local', 'pyspark')
    doc_rdd_raw = sc.newAPIHadoopRDD(input_fmt_cls_name, key_cls_name,
                                     val_cls_name, None, None,
                                     {'mongo.input.uri': dbpath_in})
    doc_rdd = doc_rdd_raw.values()
    result = doc_rdd.flatMap(index_document)  # .reduceByKey(join_hits)
    # result.coalesce(1, True).saveAsTextFile('results')
    result.saveAsNewAPIHadoopFile(
        'file:///placeholder',
        outputFormatClass=output_fmt_cls_name,
        keyClass=key_cls_name,
        valueClass=val_cls_name,
        keyConverter=key_converter,
        valueConverter=val_converter,
        conf={'mongo.output.uri': dbpath_out})
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)

    # Create an RDD backed by the MongoDB collection.
    # MongoInputFormat allows us to read from a live MongoDB instance.
    # We could also use BSONFileInputFormat to read BSON snapshots.
    rdd = sc.newAPIHadoopRDD(
        inputFormatClass='com.mongodb.hadoop.MongoInputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable',
        conf={
            'mongo.input.uri': 'mongodb://localhost:27017/db.collection'
        }
    )

    # Save this RDD as a Hadoop "file".
    # The path argument is unused; all documents will go to "mongo.output.uri".
    rdd.saveAsNewAPIHadoopFile(
        path='file:///this-is-unused',
        outputFormatClass='com.mongodb.hadoop.MongoOutputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable',
        conf={
            'mongo.output.uri': 'mongodb://localhost:27017/output.collection'
        }
    )

    # We can also save this back to a BSON file.
    rdd.saveAsNewAPIHadoopFile(
        path='hdfs://localhost:8020/user/spark/bson-demo',
        outputFormatClass='com.mongodb.hadoop.BSONFileOutputFormat',
        keyClass='org.apache.hadoop.io.Text',
        valueClass='org.apache.hadoop.io.MapWritable'
    )
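# Note: the MongoInputFormat/MongoOutputFormat classes used above come from the
# mongo-hadoop connector, which is not bundled with Spark. A typical (assumed)
# way to make them visible to the driver and executors is to pass the connector
# jar at submit time; the exact jar name and path depend on your installation:
#
#   spark-submit --jars /path/to/mongo-hadoop-core.jar your_script.py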
def get_hbase_as_rdd(host, tablename):
    sc = SparkContext(appName="hbase2rdd")
    conf = {"hbase.zookeeper.quorum": host,
            "hbase.mapreduce.inputtable": tablename}
    print "Connecting to host: " + conf["hbase.zookeeper.quorum"] + " table: " + conf["hbase.mapreduce.inputtable"]
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                                   "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                                   "org.apache.hadoop.hbase.client.Result",
                                   keyConverter=keyConv,
                                   valueConverter=valueConv,
                                   conf=conf)
    return hbase_rdd
def get_rdd(es_index, es_type):
    if es_type == "":
        resource = es_index
    else:
        resource = es_index + "/" + "doc"
    es_read_conf = {
        "es.nodes": ES_IP,
        "es.port": ES_PORT,
        "es.resource": resource,
        "es.index.read.missing.as.empty": 'yes'
    }
    conf = SparkConf().setAppName("Unfetter")
    sc = SparkContext(conf=conf)
    rdd = sc.newAPIHadoopRDD(
        inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_read_conf)
    return rdd
def read_from_hbase():
    conf = {
        "hbase.zookeeper.quorum": host,
        "hbase.mapreduce.inputtable": table
    }
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    sc = SparkContext(appName="PythonReadFromHbase")
    hbase_rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        keyConverter=keyConv,
        valueConverter=valueConv,
        conf=conf)
    count = hbase_rdd.count()
    hbase_rdd.cache()
    output = hbase_rdd.collect()
    for (k, v) in output:
        print(k, v)
def RunProcessWithSpark():
    from pyspark import SparkContext
    sc = SparkContext(appName="samplePysparkHbase",
                      pyFiles=['/home/centos/pythonFile.py'])
    conf = {"hbase.zookeeper.quorum": "172.**.**.51",
            "hbase.mapreduce.inputtable": input_table}
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    hbaseRdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        keyConverter=keyConv,
        valueConverter=valueConv,
        conf=conf)
    messageRdd = hbaseRdd.map(lambda x: x[1])  # map(lambda x: x[0]) would give only the row key
    filterRdd = messageRdd.filter(lambda x: filter_rows(x))  # filter HBase data if required
    processedRdd = filterRdd.mapPartitions(lambda x: process_data(x))  # process data
    processedRdd = processedRdd.flatMap(lambda x: x)
    save_record(processedRdd)  # save processed data to HBase
def run_driver(keyspace):
    sc = SparkContext(appName="PySpark Cassandra Hadoop Example")

    # Reading from Cassandra
    conf = {
        "cassandra.input.thrift.address": "localhost",
        "cassandra.input.thrift.port": "9160",
        "cassandra.input.keyspace": keyspace,
        "cassandra.input.columnfamily": "users",
        "cassandra.input.partitioner.class": "Murmur3Partitioner",
        "cassandra.input.page.row.size": "5000"
    }
    cass_rdd = sc.newAPIHadoopRDD(
        # inputFormatClass
        "org.apache.cassandra.hadoop.cql3.CqlInputFormat",
        # keyClass
        "java.util.Map",
        # valueClass
        "java.util.Map",
        keyConverter=INPUT_KEY_CONVERTER,
        valueConverter=INPUT_VALUE_CONVERTER,
        conf=conf)
    print cass_rdd.collect()

    # Writing to Cassandra
    now = dt.datetime.now()
    users = (
        {
            "id": "keith",
            "created_at": now,
            "updated_at": now,
            "first_name": "Keith",
            "last_name": "Bourgoin",
            "emails": set(["*****@*****.**"]),
            "logins": [now.isoformat()],
            "settings": {
                "background_color": "blue",
            },
        },
        {
            "id": "toms",
            "created_at": now,
            "updated_at": now,
            "first_name": "Toms",
            "last_name": "Baugis",
            "emails": set(["*****@*****.**"]),
            "logins": [now.isoformat()],
            "settings": {
                "background_color": "green",
            },
        },
    )

    cql = """
    UPDATE users
    SET created_at=?, updated_at=?, first_name=?, last_name=?, emails=?,
        logins=?, settings=?
    """.strip()
    conf = {
        "cassandra.output.thrift.address": "localhost",
        "cassandra.output.thrift.port": "9160",
        "cassandra.output.keyspace": keyspace,
        "cassandra.output.partitioner.class": "Murmur3Partitioner",
        "cassandra.output.cql": cql,
        "mapreduce.output.basename": "users",
        "mapreduce.outputformat.class": "org.apache.cassandra.hadoop.cql3.CqlOutputFormat",
        "mapreduce.job.output.key.class": "java.util.Map",
        "mapreduce.job.output.value.class": "java.util.List"
    }
    users = sc.parallelize(users)
    users.map(to_cql_output_format)\
         .saveAsNewAPIHadoopDataset(conf=conf,
                                    keyConverter=OUTPUT_KEY_CONVERTER,
                                    valueConverter=OUTPUT_VALUE_CONVERTER)

    sc.stop()
to run.
"""
from pyspark import SparkContext


def formatFollowerRecord(record):
    rank, (name, followers) = record
    name = name.encode('ascii', 'ignore')
    return "{0}: {1}, {2}".format(rank + 1, name, followers)


if __name__ == "__main__":
    sc = SparkContext()
    config = {"mongo.input.uri": "mongodb://localhost:27017/twitter.no"}
    inputFormatClassName = "com.mongodb.hadoop.MongoInputFormat"
    keyClassName = "org.apache.hadoop.io.Text"
    valueClassName = "org.apache.hadoop.io.MapWritable"

    RawRDD = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName,
                                valueClassName, None, None, config)

    mostFollowers = RawRDD.values()\
        .map(lambda x: (x['user']['name'], x['user']['followers_count']))\
        .foldByKey(0, max)\
        .sortBy(lambda x: -x[1])\
        .take(20)

    with open("most_followed.txt", "w") as of:
        of.write("\n".join(
            formatFollowerRecord(r) for r in enumerate(mostFollowers)))
# --
# Connections

client = Elasticsearch([{
    'host': config["es"]["host"],
    'port': config["es"]["port"]
}], timeout=60000)

rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={
        "es.nodes": config['es']['host'],
        "es.port": str(config['es']['port']),
        "es.resource": "%s/%s" % (config['edgar_index']['index'],
                                  config['edgar_index']['_type']),
        "es.query": json.dumps(query)
    }
)

# --
# Function definitions

def get_id(x):
    return sha1('__'.join(map(str, x[0]))).hexdigest()


def merge_dates(x, min_dates):
    id_ = get_id(x)
    if min_dates.get(id_, False):
sc = SparkContext(conf=conf)

# "mapreduce.input.fileinputformat.split.minsize.per.node": "67108864"
# "mapreduce.input.fileinputformat.split.minsize.per.rack": "134217728"
hadoopConf = {
    "mapreduce.input.fileinputformat.inputdir": "/user/hdfs/rawlog/app_weibomobilekafka1234_topweiboaction/",
    "mapreduce.input.fileinputformat.input.dir.recursive": "true"
}

# TextInputFormat + coalesce
# CombineTextInputFormat
source = sc.newAPIHadoopRDD(
    inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.apache.hadoop.io.Text",
    conf=hadoopConf)

source = source.coalesce(5000)

lines = source.map(lambda pair: pair[1])
words = lines.flatMap(lambda line: line.split(","))
pairs = words.map(lambda word: (word[0:10], 1))
counts = pairs.reduceByKey(lambda a, b: a + b, 30)

counts.saveAsTextFile("/user/yurun/spark/output/wordcount/")
from pyspark import SparkContext, SparkConf
import sys

# spark-submit /vagrant/spark.py <tablename> <tmp-dir>
if __name__ == "__main__":
    conf = SparkConf().setAppName('tweets spark aggregation')
    sc = SparkContext(conf=conf)
    print 'reading %s' % sys.argv[1]
    conf = {"hbase.zookeeper.quorum": "localhost",
            "hbase.mapreduce.inputtable": sys.argv[1]}
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    hbase_rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
                                   "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
                                   "org.apache.hadoop.hbase.client.Result",
                                   keyConverter=keyConv,
                                   valueConverter=valueConv,
                                   conf=conf)
    # The following line doesn't work, probably due to
    # https://issues.apache.org/jira/browse/SPARK-5361
    # hbase_rdd.saveAsHadoopFile("/usr/out.txt",
    #                            outputFormatClass="org.apache.hadoop.mapred.TextOutputFormat",
    #                            keyClass="org.apache.hadoop.io.Text",
    #                            valueClass="org.apache.hadoop.io.Text")
    print 'writing to %s' % sys.argv[2]
    hbase_rdd.filter(lambda (x, y): len(y) > 0).map(lambda (_, y): y).saveAsTextFile(sys.argv[2])
rkeyConv = "hbase.pythonconverters.ImmutableBytesWritableToStringConverter"
rvalueConv = "hbase.pythonconverters.HBaseResultToStringConverter"
host = '192.168.10.24'
hbase_table = 'public_sentiment'
rconf = {
    "hbase.zookeeper.quorum": host,
    "hbase.mapreduce.inputtable": hbase_table,
    # the "03" prefix marks WeChat data
    "hbase.mapreduce.scan.row.start": u'嫘祖杯030000',
    "hbase.mapreduce.scan.row.stop": u'嫘祖杯039999',
}
ps_data = sc.newAPIHadoopRDD(
    "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
    "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "org.apache.hadoop.hbase.client.Result",
    keyConverter=rkeyConv,
    valueConverter=rvalueConv,
    conf=rconf)
real_data = ps_data.map(lambda x: transformFromHbase(x[1]))\
    .map(lambda x: x.get('d:title', None))
print real_data.collect()
data = sc.parallelize(['the data of word', 'another data'])
result = real_data.flatMap(lambda x: cutWord(x, stopwords))\
    .map(lambda x: (x, 1))\
    .reduceByKey(lambda x, y: x + y)
print result.collect()
sc.stop()
""" exit(-1) host = sys.argv[1] keyspace = sys.argv[2] cf = sys.argv[3] sc = SparkContext(appName="CassandraInputFormat") conf = { "cassandra.input.thrift.address": host, "cassandra.input.thrift.port": "9160", "cassandra.input.keyspace": keyspace, "cassandra.input.columnfamily": cf, "cassandra.input.partitioner.class": "Murmur3Partitioner", "cassandra.input.page.row.size": "3" } cass_rdd = sc.newAPIHadoopRDD( "org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat", "java.util.Map", "java.util.Map", keyConverter= "org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter", valueConverter= "org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter", conf=conf) output = cass_rdd.collect() for (k, v) in output: print(k, v) sc.stop()
conf = SparkConf().setAppName('Getting utility Data').setMaster("local")
sc = SparkContext()

read_conf = {
    'es.nodes': es_server,
    'es.port': es_port,
    'es.resource': 'patents/patent',
    'es.query': '{ "query" : { "match_all" : {} }}',
    'es.scroll.keepalive': '120m',
    'es.scroll.size': '1000',
    'es.http.timeout': '20m'
}

data = sc.newAPIHadoopRDD(
    inputFormatClass='org.elasticsearch.hadoop.mr.EsInputFormat',
    keyClass='org.apache.hadoop.io.NullWritable',
    valueClass='org.elasticsearch.hadoop.mr.LinkedMapWritable',
    conf=read_conf)


def compare_classifications(x, y):
    len_comp = cmp(len(x), len(y))
    if len_comp == 0:
        return cmp(x, y)
    return len_comp


def get_classes(ipc_classification):
    sections = []
    classes = []
    subclasses = []
Run with example jar:
    ./bin/spark-submit --driver-class-path /path/to/example/jar \
        /path/to/examples/cassandra_inputformat.py <host> <keyspace> <cf>
Assumes you have some data in Cassandra already, running on <host>,
in <keyspace> and <cf>
"""
exit(-1)

host = sys.argv[1]
keyspace = sys.argv[2]
cf = sys.argv[3]
sc = SparkContext(appName="CassandraInputFormat")

conf = {"cassandra.input.thrift.address": host,
        "cassandra.input.thrift.port": "9160",
        "cassandra.input.keyspace": keyspace,
        "cassandra.input.columnfamily": cf,
        "cassandra.input.partitioner.class": "Murmur3Partitioner",
        "cassandra.input.page.row.size": "3"}
cass_rdd = sc.newAPIHadoopRDD(
    "org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat",
    "java.util.Map",
    "java.util.Map",
    keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter",
    conf=conf)
output = cass_rdd.collect()
for (k, v) in output:
    print (k, v)

sc.stop()
Usage: hbase_inputformat <host> <table>

Run with example jar:
    ./bin/spark-submit --driver-class-path /path/to/example/jar \
        /path/to/examples/hbase_inputformat.py <host> <table>
Assumes you have some data in HBase already, running on <host>, in <table>
"""
exit(-1)

host = sys.argv[1]
table = sys.argv[2]
sc = SparkContext(appName="HBaseInputFormat")

conf = {"hbase.zookeeper.quorum": host,
        "hbase.mapreduce.inputtable": table}
hbase_rdd = sc.newAPIHadoopRDD(
    "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
    "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "org.apache.hadoop.hbase.client.Result",
    keyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter",
    conf=conf)
output = hbase_rdd.collect()
for (k, v) in output:
    print(k, v)

sc.stop()
# spark_session = SparkSession(spark_context)

hbase_host = "localhost"
hbase_port = "2181"
hbase_table = "TEST_RPT"
hbase_conf = {
    "hbase.zookeeper.quorum": hbase_host,
    "hbase.zookeeper.property.clientPort": hbase_port,
    "hbase.mapreduce.inputtable": hbase_table,
    "hbase.mapreduce.scan.row.start": '18589801',
    "hbase.mapreduce.scan.row.stop": '18589802',
    "hbase.mapreduce.scan.columns": "info:cont info:bd info:sex"
}
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"

hbase_rdd = spark_context.newAPIHadoopRDD(
    "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
    "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "org.apache.hadoop.hbase.client.Result",
    keyConverter=keyConv,
    valueConverter=valueConv,
    conf=hbase_conf)

count = hbase_rdd.count()
hbase_rdd.cache()
output = hbase_rdd.collect()
for (k, v) in output[:1]:
    print(k, v)
class SearchAdapter:
    def __init__(self, elastic_config, es):
        self.es = es
        self.index = elastic_config['index']
        findspark.add_jars(elastic_config['es_hadoop_jar'])
        findspark.init()
        from pyspark import SparkContext
        self.sc = SparkContext(appName="adbi-top-k")

    def to_elasticsearch_query(self, query: Query):
        clauses = []
        logger.info('Converting query %s' % vars(query))
        if query.term:
            clauses.append({"match": {"text": query.term}})
        if query.place:
            clauses.append({"match": {"place": query.place}})
        if query.start_time and query.end_time:
            clauses.append({
                "range": {
                    "created_at": {
                        "gte": query.start_time,
                        "lte": query.end_time
                    }
                }
            })
        return {"query": {"bool": {"must": clauses}}}

    def find(self, query: Query):
        q_str = self.to_elasticsearch_query(query)
        print(q_str)
        es_read_conf = {
            "es.resource": "{0}/tweet".format(self.index),
            "es.query": json.dumps(q_str)
        }
        es_rdd = self.sc.newAPIHadoopRDD(
            inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
            keyClass="org.apache.hadoop.io.NullWritable",
            valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
            conf=es_read_conf)

        top_k = []
        if es_rdd.isEmpty():
            return top_k

        tweets = es_rdd.map(lambda doc: doc[1]['text'])
        tweets = tweets.map(bloom_filter)
        words_df = tweets.flatMap(lambda line: line)
        # Build a count-min sketch for every partition and combine them into one CMS.
        cms_merged = words_df.aggregate(CountMinSketch(22000, 200),
                                        lambda cms, word: cms.add(word),
                                        lambda cms1, cms2: cms1.merge(cms2))
        words = words_df.distinct().collect()
        queue = PriorityQueue()
        for word in words:
            queue = topK(word, cms_merged, queue, query.k)
        while not queue.empty():
            top = queue.get()
            top_k.append({'word': top.value, 'count': top.count})
        return top_k
sc = SparkContext("local[*]", "Dynamic_Resource_Alloc") # , batchSize=5000) # sc = SparkContext("local[*]", "Spark_Kafka_Streaming_App") # , batchSize=5000) # streaming object ssc = StreamingContext(sc, 10) # dataframe object sqlContext = SQLContext(sc) es_rdd = sc.newAPIHadoopRDD( inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat", keyClass="org.apache.hadoop.io.NullWritable", valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", conf={ "es.resource": "filebeat-2018.10.01/log", 'es.index.auto.create': 'true', 'es.index.read.missing.as.empty': 'true', "es.nodes": 'cldmaster.local', "es.port": '9200' }) # pprint.pprint(es_rdd.first()[0]) # es_rdd.saveAsTextFile("/user/cloudera/ELK_log") es_rdd.foreach() # # # Kafka Consumer client, connect to Kafka producer server # # kvs = KafkaUtils.createStream(ssc, 'cldmaster.local:2181', 'spark-streaming-consumer', {topic1: 1, topic2: 1}, # # keyDecoder=iso_8859_1, valueDecoder=iso_8859_1)
row2 column=f1:, timestamp=1401883415212, value=value2
row3 column=f1:, timestamp=1401883417858, value=value3
row4 column=f1:, timestamp=1401883420805, value=value4
4 row(s) in 0.0240 seconds
"""
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, """
        Usage: hbase_inputformat <host> <table>

        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar
        /path/to/examples/hbase_inputformat.py <host> <table>
        Assumes you have some data in HBase already, running on <host>, in <table>
        """
        exit(-1)

    host = sys.argv[1]
    table = sys.argv[2]
    sc = SparkContext(appName="HBaseInputFormat")

    conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": table}
    hbase_rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        valueConverter="org.apache.spark.examples.pythonconverters.HBaseConverter",
        conf=conf)
    output = hbase_rdd.collect()
    for (k, v) in output:
        print (k, v)
# credit http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i+n]


# set up context
sc = SparkContext("local[*]", "Simple App")
# sc = SparkContext("spark://url:7077", "Simple App")
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.shuffle.partitions", "5")

# issue movies query
conf = {"es.resource": "movies2/logs", "es.query": "?q=name:picture"}
movies = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",
                            "org.apache.hadoop.io.NullWritable",
                            "org.elasticsearch.hadoop.mr.LinkedMapWritable",
                            conf=conf)

# place results in table
moviesRows = movies.map(lambda p: Row(id=int(p[1]['id']), name=p[1]['name']))
moviesRowsList = moviesRows.collect()
schemaMovies = sqlContext.createDataFrame(moviesRowsList)
schemaMovies.registerTempTable("movies")
sqlContext.cacheTable("movies")

# get ids in order to form acted_in query
ids = []
for moviesRow in moviesRowsList:
    ids.append(moviesRow['id'])
movieIdSnippets = []
for id in ids:
    movieIdSnippets.append("movie_id:" + str(id))
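# A quick illustration of the chunks() helper above (hypothetical values):
#   list(chunks([1, 2, 3, 4, 5], 2))  ->  [[1, 2], [3, 4], [5]]
# Presumably movieIdSnippets is later split this way so that each
# Elasticsearch query string stays below a practical length.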
from pyspark import SparkContext
# Spark Streaming
from pyspark.streaming import StreamingContext
# Kafka
from pyspark.streaming.kafka import KafkaUtils
# json parsing
import json

sc = SparkContext(appName="json")
sc.setLogLevel("WARN")

hostName = 'localhost'
tableName = 'pysparkBookTable'
ourInputFormatClass = 'org.apache.hadoop.hbase.mapreduce.TableInputFormat'
ourKeyClass = 'org.apache.hadoop.hbase.io.ImmutableBytesWritable'
ourValueClass = 'org.apache.hadoop.hbase.client.Result'
ourKeyConverter = 'org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter'
ourValueConverter = 'org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter'
configuration = {}
configuration['hbase.mapreduce.inputtable'] = tableName
configuration['hbase.zookeeper.quorum'] = hostName

tableRDDfromHBase = sc.newAPIHadoopRDD(inputFormatClass=ourInputFormatClass,
                                       keyClass=ourKeyClass,
                                       valueClass=ourValueClass,
                                       keyConverter=ourKeyConverter,
                                       valueConverter=ourValueConverter,
                                       conf=configuration)

tableRDDfromHBase.take(2)
conf = SparkConf()
conf.setAppName("spark_app_wordcount_merge")
sc = SparkContext(conf=conf)

# "mapreduce.input.fileinputformat.split.minsize.per.node": "67108864"
# "mapreduce.input.fileinputformat.split.minsize.per.rack": "134217728"
hadoopConf = {"mapreduce.input.fileinputformat.inputdir": "/user/hdfs/rawlog/app_weibomobilekafka1234_topweiboaction/",
              "mapreduce.input.fileinputformat.input.dir.recursive": "true"}

# TextInputFormat + coalesce
# CombineTextInputFormat
source = sc.newAPIHadoopRDD(inputFormatClass="org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                            keyClass="org.apache.hadoop.io.LongWritable",
                            valueClass="org.apache.hadoop.io.Text",
                            conf=hadoopConf)

source = source.coalesce(5000)

lines = source.map(lambda pair: pair[1])
words = lines.flatMap(lambda line: line.split(","))
pairs = words.map(lambda word: (word[0:10], 1))
counts = pairs.reduceByKey(lambda a, b: a + b, 30)

counts.saveAsTextFile("/user/yurun/spark/output/wordcount/")

sc.stop()
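# The "CombineTextInputFormat" comment above hints at an alternative to
# TextInputFormat + coalesce: let Hadoop pack many small files into fewer
# splits up front. A minimal sketch, assuming the same hadoopConf plus a
# maximum combined split size (the 128 MB value is purely illustrative):
#
# hadoopConf["mapreduce.input.fileinputformat.split.maxsize"] = "134217728"
# source = sc.newAPIHadoopRDD(
#     inputFormatClass="org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat",
#     keyClass="org.apache.hadoop.io.LongWritable",
#     valueClass="org.apache.hadoop.io.Text",
#     conf=hadoopConf)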
#sc = SparkContext("spark://quickstart.cloudera:7077", "Test MongoDB Connector") sc = SparkContext("local", "Test MongoDB Connector") # Config MongoDB inputConfig = { "mongo.input.uri" : "mongodb://localhost:27017/marketdata.stock_prices" } outputConfig = { "mongo.output.uri" : "mongodb://localhost:27017/marketdata.maxminprices" } # Config pour RDD qui va lire les data dans MongoDB inputFormatClassName = "com.mongodb.hadoop.MongoInputFormat" keyClassName = "java.lang.Object" valueClassName = "org.bson.BSONObject" stockPricesRDD = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName, valueClassName, None, None, inputConfig) # Config pour RDD qui va ecrire dans MongoDB outputFormatClassName = "com.mongodb.hadoop.MongoOutputFormat" # Les traitements... # ... sur l'ensemble des data prices = stockPricesRDD.values() # ... groupby sur (symbol, day) groupByRDD = prices.groupBy(lambda doc: (doc["Symbol"], doc["Day"])) # .map(lambda tuple: (tuple[0], tuple[1])) \ # .collect() # ... aggregate par clef (on prend le max de High et le min de Low) def maxMin(doc, groupedDocs):
            'Rank': f + 1,
            'Feature': train_df.columns.values[f],
            'Rating': importances[indices[f]]
        })
    return rf, probabilities, feature_idx


# ## Generic Model
#
# The following cells use a generic function to train and run a random forest.

# get data for predicting future probabilities
predictions = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=pred_read_conf)
predictions.cache()

# empty feature importance RDD
featuresRDD = sc.parallelize([])

# ### All Victims

# get set of accident and non-accident records
# defines an ES query for accidents with injury or fatality
q = '''
{
    "query": {
import datetime
from pyspark.sql import functions as F

if __name__ == '__main__':
    start = datetime.datetime.now()
    print("Start time:", start)
    sparkConf = SparkConf() \
        .setAppName('demo1')
    sc = SparkContext(conf=sparkConf)
    sc.setLogLevel('WARN')
    sqlContext = SQLContext(sc)
    # read data from HBase
    hbase_rdd = sc.newAPIHadoopRDD(HBaseUtil.inputFormatClass,
                                   HBaseUtil.keyClass,
                                   HBaseUtil.valueClass,
                                   keyConverter=HBaseUtil.keyConv,
                                   valueConverter=HBaseUtil.valueConv,
                                   conf=HBaseUtil.conf)
    values = hbase_rdd.values()
    init_rdd = values.flatMap(lambda x: x.split("\n")).map(lambda x: json.loads(x)) \
        .map(lambda x: dp.dict_del(x))
    data_frame = sqlContext.read.json(init_rdd)
    # data_frame.show()
    # data_frame.printSchema()
    result = data_frame.groupBy('qualifier').agg(
        F.min(data_frame.value),
        F.max(data_frame.value),
        F.avg(data_frame.value),
        F.sum(data_frame.value),
        F.count(data_frame.value)).collect()
    for x in result:
        print(x)
def quiet_logs(sc):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)


sc = SparkContext("local", "Course Stats")
quiet_logs(sc)

config = {"mongo.input.uri": "mongodb://192.168.99.100:27017/wsl.learning_units"}
inputFormatClassName = "com.mongodb.hadoop.MongoInputFormat"

# these values worked but others might as well
keyClassName = "org.apache.hadoop.io.Text"
valueClassName = "org.apache.hadoop.io.MapWritable"

statsRawRDD = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName,
                                 valueClassName, None, None, config)

# valuesRDD = statsRawRDD.values()
# resultRDD = valuesRDD.map(lambda doc: str(doc["type"]))

# configuration for output to MongoDB
config["mongo.output.uri"] = "mongodb://192.168.99.100:27017/wsl.stats"
outputFormatClassName = "com.mongodb.hadoop.MongoOutputFormat"

num = statsRawRDD.count()

# doesn't yet work - certain Java types like HashMap, String, GregorianCalendar
# don't serialize; version 1.5 of the hadoop mongo connector may fix this
# statsRawRDD.saveAsNewAPIHadoopFile("file:///placeholder", outputFormatClassName, None, None, None, None, config)

print "Count: ", num
def inverse_ref(url_words):
    url = url_words[0]
    words = re.split("[^a-z]+", url_words[1].lower())
    for word in words:
        stemmed_word = stem(word.decode('utf-8'))
        if is_word_ok(stemmed_word):
            yield {stemmed_word: url}


hbaseConfig = {"hbase.mapreduce.inputtable": "wiki",
               "hbase.mapreduce.scan.columns": "cf:content"}
table = sc.newAPIHadoopRDD(
    'org.apache.hadoop.hbase.mapreduce.TableInputFormat',
    'org.apache.hadoop.hbase.io.ImmutableBytesWritable',
    'org.apache.hadoop.hbase.client.Result',
    keyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter",
    conf=hbaseConfig)

# inverse_ref is a generator, so flatMap (rather than map) yields its items.
words_occ = table.flatMap(inverse_ref).filter(lambda l: l).map(lambda l: (l, 1))
words = words_occ.reduceByKey(lambda a, b: a + b).filter(lambda (a, b): b > 100)
word_counts = dict(words.collectAsMap())
for word in word_counts:
# print("Usage: kmeans <file> <k>", file=sys.stderr) # exit(-1) # if len(sys.argv) != 2: # print("Usage: kmeans <file>", file=sys.stderr) # exit(-1) conf = SparkConf() sc = SparkContext(appName="cluster-home-devices") #input_date = '2016-07-01' input_date = sys.argv[1] database_url = 'mongodb://127.0.0.1:27017/home_automation.device_stats' rdd = sc.newAPIHadoopRDD( inputFormatClass='com.mongodb.hadoop.MongoInputFormat', keyClass='org.apache.hadoop.io.Text', valueClass='org.apache.hadoop.io.MapWritable', conf={ 'mongo.input.uri': database_url, 'mongo.input.split.create_input_splits': 'false' }) #conf={'mongo.input.uri': database_url}) #rdd.cache() rdd.persist(pyspark.StorageLevel.DISK_ONLY) print("Hadoop RDD" + str(rdd.take(1))) rdd_for_day = rdd.filter(lambda s: input_date in s[1]['timestamp_hour']) rdd_for_day.persist(pyspark.StorageLevel.DISK_ONLY ) # needed as mongodb connector has bugs. #import rpdb; rpdb.set_trace() print("Hadoop RDD" + str(rdd_for_day.take(1))) #print("Hadoop RDD" + str(rdd_for_day.collect())) #exit(-1) rdd1 = rdd_for_day.map(get_device_stats_mongo_doc)
es.update(index='spark-jobs', doc_type='job', id=task_id, body={
    'doc': {
        'current': 1,
        'status': 'Spark job started..'
    }
})

result_indices = len(es.indices.get_aliases(index="titanic-results-*"))
output_resource = "titanic-results-%s/value-counts" % (result_indices + 1)

conf = SparkConf().setAppName("ESTest")
sc = SparkContext(conf=conf)

es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={"es.resource": "titanic/passenger"})

doc = es_rdd.first()[1]
num_fields = len(doc)

for idx, field in enumerate(doc):
    es.update(index='spark-jobs', doc_type='job', id=task_id, body={
        'doc': {
            'current': (idx + 1) * 95 / num_fields,
            'status': 'Spark job underway..'
        }
    })
# pattern = r"((([01]?[0-9]?[0-9]|2[0-4][0-9]|25[0-5])[ (\[]?(\.|dot)[ )\]]?){3}([01]?[0-9]?[0-9]|2[0-4][0-9]|25[0-5]))"
pattern = r"((?:[0-9]{1,3}\.){3}[0-9]{1,3})"

if __name__ == "__main__":

    # string
    confstring = {
        'es.nodes': 'elasticsearch',
        'es.port': '9200',
        'es.resource': 'graylog2_0/message',
        'es.query': '{"query": { "multi_match" : { "query" : "", "fields" : [ "message" ] } }}'
    }

    conf = SparkConf().setAppName("ESTest")
    sc = SparkContext(conf=conf)

    es_read_conf = {
        'es.nodes': 'elasticsearch',
        'es.port': '9200',
        'es.resource': 'graylog2_*/message',
        'es.query': '{"query": { "multi_match" : { "query" : "", "fields" : [ "message" ] } }}'
    }
    es_read_conf['es.query'] = custom_query

    es_rdd = sc.newAPIHadoopRDD(
        inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_read_conf)

    es_write_conf = {
        'es.nodes': 'elasticsearch',
        'es.port': '9200',
        'es.resource': 'spark_analytics/severe_analytics'
    }

    es_write_conf_ip = {
        'es.nodes': 'elasticsearch',
        'es.port': '9200',
        'es.resource': 'spark_analytics/analytics'
    }

    doc = es_rdd.first()[1]
    # print es_rdd.collect()
    # exit()
        }
    }
else:
    print 'must choose one option [--from-scratch; --last-week]'

# --
# Connecting to ES

resource_string = 'forms/3,4'

rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={
        "es.nodes": config['elasticsearch']['host'],
        "es.port": str(config['elasticsearch']['port']),
        "es.resource": resource_string,
        "es.query": json.dumps(query)
    }
)

# --
# Function definition

def clean_logical(x):
    if str(x).lower() == 'true':
        return 1
    if str(x).lower() == 'false':
        return 0
    else:
keyClass = 'org.apache.hadoop.hbase.io.ImmutableBytesWritable'
valueClass = 'org.apache.hadoop.hbase.client.Result'
conf = {
    'hbase.zookeeper.quorum': host,
    'hbase.mapreduce.inputtable': table,
    # 'hbase.mapreduce.scan.column.family': 'info',
    # 'hbase.mapreduce.scan.columns': 'info:data01',
    # 'hbase.mapreduce.scan.row.start': 'ss0001',
    # 'hbase.mapreduce.scan.row.stop': 'ss0010'
}
keyConv = 'org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter'
valueConv = 'org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter'

hbase_rdd = sc.newAPIHadoopRDD(inputFormatClass, keyClass, valueClass,
                               keyConverter=keyConv,
                               valueConverter=valueConv,
                               conf=conf)

# values__take = hbase_rdd.values().saveAsTextFile('hdfs://master:9000/work/data/output01')
# print(values__take)
# for x in values__take:
#     print(x)

end = datetime.datetime.now()
print('----------------time:', end)
print('-----newAPIHadoopRDD------- time :', end - start)

values = hbase_rdd.values()
n_map = values.flatMap(lambda x: x.split("\n")).map(
    lambda x: json.loads(x))
# n_map = values.flatMap(lambda x: x.split("\n")).foreach(lambda x: print(x))
data_frame = sqlContext.read.json(n_map)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging

from operator import add

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

__author__ = 'guotengfei'
__time__ = 2019 / 11 / 26

"""
Module comment
"""

LOGGER = logging.getLogger(__name__)

if __name__ == '__main__':
    sc = SparkContext('local', appName="PythonWordCount")
    # Stub call: newAPIHadoopRDD() needs at least inputFormatClass, keyClass
    # and valueClass arguments before it will run.
    sc.newAPIHadoopRDD()
es_read_conf = {
    "es.nodes": "159.203.184.137",
    "es.port": "9200",
    "es.resource": "titanic/passenger"
}

es_write_conf = {
    "es.nodes": "159.203.184.137",
    "es.port": "9200",
    "es.resource": "titanic/value_counts"
}

es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=es_read_conf)

doc = es_rdd.first()[1]
for field in doc:
    value_counts = es_rdd.map(lambda item: item[1][field])
    value_counts = value_counts.map(lambda word: (word, 1))
    value_counts = value_counts.reduceByKey(lambda a, b: a + b)
    value_counts = value_counts.filter(lambda item: item[1] > 1)
    value_counts = value_counts.map(lambda item: ('key', {
        'field': field,
        'val': item[0],
        'count': item[1]
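# Note on the loop above: the lambdas close over the loop variable "field" and
# Spark evaluates them lazily. If the per-field action (not shown in this
# truncated snippet) only runs after the loop finishes, every lambda would see
# the last value of "field". Binding it explicitly is a common fix, e.g.
# es_rdd.map(lambda item, f=field: item[1][f]).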
          doc_type='job',
          id=task_id,
          body={'doc': {
              'current': 1,
              'status': 'Spark job started..'
          }})

result_indices = len(es.indices.get_aliases(index="titanic-results-*"))
output_resource = "titanic-results-%s/value-counts" % (result_indices + 1)

conf = SparkConf().setAppName("ESTest")
sc = SparkContext(conf=conf)

es_rdd = sc.newAPIHadoopRDD(
    inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf={"es.resource": "titanic/passenger"})

doc = es_rdd.first()[1]
num_fields = len(doc)

for idx, field in enumerate(doc):
    es.update(index='spark-jobs',
              doc_type='job',
              id=task_id,
              body={
                  'doc': {
                      'current': (idx + 1) * 95 / num_fields,
es_query = jparse.parse(json.loads(args[2]))
if not es_query:
    sc.stop()
    sys.exit(1)

conf = {
    "es.net.http.auth.user": es_config['es_user'],
    "es.net.http.auth.pass": es_config['es_pass'],
    "es.resource": es_config['es_resource'],
    "es.nodes": es_config['es_nodes'],
    "es.serializer": "org.apache.spark.serializer.KryoSerializer",
    "es.query": '%s' % es_query
}

rdd = sc.newAPIHadoopRDD(
    "org.elasticsearch.hadoop.mr.EsInputFormat",
    "org.apache.hadoop.io.NullWritable",
    "org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=conf)


def list_data(x):
    return [x[0], x[1]['person']['feature']['face']]


jparse.feature = jparse.feature.replace(' ', '')
bro_des_feat = sc.broadcast(jparse.feature)
threshold = es_config['threshold']


def feature_calc(x):
    lib_handle = ctypes.CDLL('./libfeatureprocess.so')
    func = lib_handle.feature_process
    func.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
    func.restype = ctypes.c_float
class OperationHbase(object):
    def __init__(self):
        self.sc = SparkContext('local', 'test')

    def get_hbase_data(self, tablename):
        table = str(tablename)
        host = 'localhost'
        conf = {
            "hbase.zookeeper.quorum": host,
            "hbase.mapreduce.inputtable": table
        }
        keyConv = 'org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter'
        valueConv = 'org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter'
        hbase_rdd = self.sc.newAPIHadoopRDD(
            "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
            "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "org.apache.hadoop.hbase.client.Result",
            keyConverter=keyConv,
            valueConverter=valueConv,
            conf=conf)
        count = hbase_rdd.count()
        hbase_rdd.cache()
        output = hbase_rdd.collect()
        for k, v in output:
            print(k, v)

    def insert_hbase(self, tablename, datalist):
        table = str(tablename)
        host = 'localhost'
        keyConv = 'org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter'
        valueConv = 'org.apache.spark.examples.pythonconverters.StringListToPutConverter'
        conf = {
            "hbase.zookeeper.quorum": host,
            "hbase.mapred.outputtable": table,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"
        }
        self.sc.parallelize(datalist)\
            .map(lambda p: (p[0], p.split('ê ē')))\
            .saveAsNewAPIHadoopDataset(
                conf=conf, keyConverter=keyConv, valueConverter=valueConv
            )

    def csv_to_tuple(self, csvpath):
        csv_df = pd.read_csv(str(csvpath))
        func = lambda i: csv_df.loc[i]
        info_list = [func(i) for i in range(csv_df.shape[0])]
        insert_list = []
        for i in info_list:
            # ele_list = []
            # ele_list.append(int(i['Unnamed: 0']))
            # ele_list.append('work')
            # ele_list.append('desp')
            # ele_list.append(str(i['work_desp']))
            # ele_tuple = (i['Unnamed: 0'], ele_list)
            ele = str(i['Unnamed: 0']) + 'ê ē' + 'work' + 'ê ē' + 'desp' + 'ê ē' + str(i['work_desp'])
            insert_list.append(ele)
        print('CSV file conversion complete')
        return insert_list