def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # Tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #2
    def __connected_yarn_spark_cluster(self, pilotcompute_description):

        number_cores = 1
        if "number_cores" in pilotcompute_description:
            number_cores = int(pilotcompute_description["number_cores"])

        number_of_processes = 1
        if "number_of_processes" in pilotcompute_description:
            number_of_processes = int(pilotcompute_description["number_of_processes"])

        executor_memory = "1g"
        if "physical_memory_per_process" in pilotcompute_description:
            executor_memory = pilotcompute_description["physical_memory_per_process"]

        conf = SparkConf()
        conf.set("spark.num.executors", str(number_of_processes))
        conf.set("spark.executor.instances", str(number_of_processes))
        conf.set("spark.executor.memory", executor_memory)
        conf.set("spark.executor.cores", number_cores)
        if pilotcompute_description!=None:
            for i in pilotcompute_description.keys():
                if i.startswith("spark"):
                    conf.set(i, pilotcompute_description[i])
        conf.setAppName("Pilot-Spark")
        conf.setMaster("yarn-client")
        sc = SparkContext(conf=conf)
        sqlCtx = SQLContext(sc)
        pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
        return pilot
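A hedged example of the pilotcompute_description dict this method consumes (the keys are taken from the code above; the values and the extra spark.* entry are illustrative):

example_description = {
    "number_of_processes": 4,             # becomes spark.executor.instances
    "number_cores": 2,                    # becomes spark.executor.cores
    "physical_memory_per_process": "2g",  # becomes spark.executor.memory
    "spark.default.parallelism": "16",    # any spark.* key is forwarded to SparkConf
}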
 def setUp(self):
     conf = SparkConf().setAppName('testing').setMaster('local[2]').set('spark.driver.host', 'localhost')
     conf.set('spark.ui.showConsoleProgress', False)
     self.session = SparkSession.builder.config(conf=conf).getOrCreate()
     self.test_data = [
         ('Ricardo', 'engineering', 2),
         ('Tisa', 'sales', 3),
         ('Sheree', 'marketing', 4), 
         ('Chantelle', 'engineering', 5),
         ('Kylee', 'finance', 2),
         ('Tamatha', 'marketing', 5),
         ('Trena', 'engineering', 2),
         ('Arica', 'engineering', 1),
         ('Santina', 'finance', 2),
         ('Daria', 'marketing', 1),
         ('Magnolia', 'sales', 2),
         ('Antonina', 'finance', 1),
         ('Sumiko', 'engineering', 1),
         ('Carmen', 'sales', 2),
         ('Delois', 'engineering', 1),
         ('Luetta', 'marketing', 3),
         ('Yessenia', 'sales', 1),
         ('Petra', 'engineering', 3),
         ('Charisse', 'engineering', 4),
         ('Lillian', 'engineering', 3),
         ('Wei', 'engineering', 2),
         ('Lahoma', 'sales', 2),
         ('Lucilla', 'marketing', 1),
         ('Stephaine', 'finance', 2),
     ]
def configureSpark():
	conf = SparkConf()
	conf.setMaster("local")
	conf.setAppName("Apache Spark Alarm Parser")
	conf.set("spark.executor.memory", "1g")
	sc = SparkContext(conf = conf)
	return sc
Example #5
    def setUpClass(cls):

        class_name = cls.__name__
        conf = SparkConf()
        conf.set('spark.app.name', class_name)

        # Read the spark configuration and update the spark conf
        test_spark_config = ConfigParser.ConfigParser()
        test_spark_config.read('test_config.cfg')
        test_spark_config.sections()
        configs = dict(test_spark_config.items('spark_conf_test_generic'))
        for k, v in configs.items():
            conf.set(k, v)
        cls.spark_test_configs = configs
        # Create the spark context
        cls.sc = SparkContext(conf=conf)
        if 'PYSPARK_DRIVER_PYTHON' in configs.keys():
            cls.sc.pythonExec = configs['PYSPARK_DRIVER_PYTHON']
        else:
            cls.sc.pythonExec = 'python2.7'

        logger = cls.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s')
        cls.logger = logging.getLogger(__name__)
        cls.logger.setLevel(logging.DEBUG)
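A hedged sketch of the test_config.cfg file this setup reads (the section name comes from the code above; the individual entries are illustrative):

[spark_conf_test_generic]
spark.master = local[2]
spark.executor.memory = 1g
PYSPARK_DRIVER_PYTHON = python2.7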
class SparkContextFactory:
  def __init__(self):
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["SPARK_HOME"] = "C:\Spark"
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
    ##sys.path.append("C:\Spark\python")
    ##sys.path.append("C:\Spark\bin")

    # specify spark home
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
    # specify pyspark path so its libraries can be accessed by this application
    sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext

    self.conf = SparkConf().setMaster("yarn-client")
    self.conf.setAppName("MrT")
    self.conf.set("spark.executor.memory", "5g")
    self.conf.set("spark.driver.memory", "10g")

    self.sc = SparkContext(conf = self.conf, pyFiles =
    ["ComputeCovHistory.py", "go.py", "risk_DSconvert.py", "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

    """
    toDF method is a monkey patch executed inside SQLContext constructor
    so to be able to use it you have to create a SQLContext first
    """
    self.sqlContextInstance = SQLContext(self.sc)


  def disconnect(self):
    self.sc.stop()
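A minimal usage sketch for the factory above (the sample data is illustrative); toDF is only available because the SQLContext was constructed in __init__:

factory = SparkContextFactory()
df = factory.sc.parallelize([(1, "a"), (2, "b")]).toDF(["id", "value"])
df.show()
factory.disconnect()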
Example #7
def get_default_spark_conf():
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local-cluster[3,1,2048]"). \
        set("spark.ext.h2o.disable.ga","true"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.backend.cluster.mode", ExternalClusterTestHelper.cluster_mode()). \
        set("spark.ext.h2o.cloud.name", ExternalClusterTestHelper.unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", os.getenv("spark.ext.h2o.external.start.mode", "manual")) .\
        set("spark.sql.warehouse.dir", "file:" + os.path.join(os.getcwd(), "spark-warehouse"))


    if ExternalClusterTestHelper.tests_in_external_mode():
        conf.set("spark.ext.h2o.client.ip", ExternalClusterTestHelper.local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "2")

    return conf
Example #8
 def getSparkContext(self, appName, master):
     print(appName)
     print(master)
     conf = SparkConf().setAppName(appName).setMaster(master)
     conf.set("spark.local.ip", "127.0.0.1")
     conf.set("spark.driver.host", "127.0.0.1")
     return SparkContext(conf=conf)
Example #9
def main():
    """
    Main entry point of the application
    """

    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc, file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
def main():
	conf = SparkConf()
	conf.set("spark.default.parallelism", "24")
	sc = SparkContext(appName="PhoneLab Preprocessing", conf=conf)

	lines = sc.textFile(data_files, use_unicode=False)

	# Create LogLine objects and filter out empty lines
	logs = lines.flatMap(ll_mapper)

	# Save in an intermediate format
	logs.saveAsTextFile(out_dir, compressionCodecClass=codec)
	return

	# Gap detection
	keyed = logs.map(ll_gap_map)
	merged = keyed.groupByKey()

	# At this point we have ((boot_id, date), [line_num]) tuples The last step.
	# is to find all the gaps within each key/tuple.
	result = merged.flatMap(find_gaps)
	gaps = result.collect()

	fd = open("/spark/gaps.json", 'w')
	fd.write(json.dumps(gaps, indent=4))
Example #11
    def start_spark(self,
                    spark_conf=None, 
                    executor_memory=None,
                    profiling=False, 
                    graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11', 
                    extra_conf = None):
        """Launch a SparkContext 
        
        Parameters
        ----------
        spark_conf: path
            path to a spark configuration directory
        executor_memory: string
            executor memory in java memory string format, e.g. '4G'
            If `None`, `memory_per_executor` is used. 
        profiling: boolean
            whether to turn on python profiling or not
        graphframes_package: string
            which graphframes to load - if it isn't found, spark will attempt to download it
        extra_conf: dict
            additional configuration options
        """

        os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell"\
                                            .format(graphframes_package=graphframes_package)
        
        if spark_conf is None:
            spark_conf = os.path.join(os.environ['SPARK_HOME'], 'conf')

        os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)

        os.environ['PYSPARK_PYTHON'] = sys.executable

        try: 
            import findspark; findspark.init()
            from pyspark import SparkContext, SparkConf
        except ImportError: 
            raise ImportError("Unable to find pyspark -- are you sure SPARK_HOME is set?")

        conf = SparkConf()

        conf.set('spark.driver.maxResultSize', '0')

        if executor_memory is None: 
            executor_memory = '%dM'%self.memory_per_executor

        conf.set('spark.executor.memory', executor_memory)

        if profiling: 
            conf.set('spark.python.profile', 'true')
        else:
            conf.set('spark.python.profile', 'false')
        
        if extra_conf is not None: 
            for k,v in extra_conf.items(): 
                conf.set(k,v)

        sc = SparkContext(master=self.master_url(), conf=conf)

        return sc    
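A hedged usage sketch for start_spark (assumes `cluster` is an instance of the enclosing class, which supplies memory_per_executor and master_url(); the extra_conf entry is illustrative):

sc = cluster.start_spark(
    executor_memory='4G',
    profiling=True,
    extra_conf={'spark.speculation': 'true'},
)
print(sc.parallelize(range(100)).count())
sc.stop()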
Example #12
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example #13
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # Tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
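The helpers storeOffsetRanges and printOffsetRanges are not shown in the snippet above; a sketch following the offset-tracking pattern from the Spark Streaming + Kafka integration guide could look like this:

offsetRanges = []

def storeOffsetRanges(rdd):
    # Stash the Kafka offset ranges of this batch's RDD for later inspection
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))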
Example #14
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt",  minPartitions= 500, use_unicode=False)
    rdd.unpersist()
#     print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter = ' '))
    
    GA.logInConsole(0 , "Data Vectorized!")
    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD,sc, 5))
#     GA.logInConsole(-10, "GA with range 4")
#     ss.append(GA.parallel_GA_main(norm,sc, 4))
#     GA.logInConsole(-10, "GA with range 5")
#     ss.append(GA.parallel_GA_main(norm,sc, 5))
#     GA.logInConsole(-10, "GA with range 3 and Sampled data set")
#    sampleRDD = norm.sample(False, 0.6, seed=10)
#    ss.append(GA.parallel_GA_main(sampleRDD,sc, 3))
    print(ss)
    #selectedSS = voted_subsapces(ss)
#     SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
Example #15
 def _test_broadcast_on_driver(self, *extra_confs):
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     bs = self.sc.broadcast(value=5)
     self.assertEqual(5, bs.value)
Example #16
class OWSparkContext(SharedSparkContext, widget.OWWidget):
    priority = 0
    name = "Context"
    description = "Create a shared Spark (sc) and Hive (hc) Contexts"
    icon = "../icons/spark.png"

    want_main_area = False
    resizing_enabled = True

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_predefined = dict(self.conf.getAll())
        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace = True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'OrangeSpark'
        main_parameters['spark.master'] = 'yarn-client'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in main_parameters.items():
            default_value = all_predefined.setdefault(k, v)
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=default_value)
            all_predefined.pop(k)

        for k, v in all_predefined.items():
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label = 'Submit', callback = self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):

        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())

        self.sc = SparkContext(conf = self.conf)
        self.hc = HiveContext(self.sc)
Example #17
def create_spark_context(app_name="Quiz Bowl", lm_memory=False, profile=False):
    spark_conf = SparkConf()
    if lm_memory:
        pass
        # spark_conf = spark_conf.set('spark.max.cores', 30).set('spark.executor.cores', 30)
    if profile:
        spark_conf = spark_conf.set('spark.python.profile', True)
    spark_conf = spark_conf.set('spark.akka.frameSize', 300)
    return SparkContext(appName=app_name, master=QB_SPARK_MASTER, conf=spark_conf)
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data")
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    rdd = load_data(sc)  
    print(rdd.getNumPartitions())
    parallel_GA_main(sc, rdd, 5)
    
    sc.stop()
Example #19
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("ctr-server")
    conf.set('spark.kryoserializer.buffer', '512mb')
    conf.set('spark.kryoserializer.buffer.max', '512')
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['/home/ec2-user/engine.py', '/home/ec2-user/app.py'])
 
    return sc
Example #20
    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))
Example #21
def createSparkConf():
	from pyspark import SparkConf
	test_properties = conftest.test_properties()

	conf = SparkConf()
	conf.set("cloudant.host", test_properties["cloudanthost"])
	conf.set("cloudant.username", test_properties["cloudantusername"])
	conf.set("cloudant.password", test_properties["cloudantpassword"])
	
	return conf
Example #22
def main():
    # Spark Configurations
    conf = SparkConf()
    conf.set("spark.master", "local[*]")
    conf = conf.setAppName('Learning PySpark')
    sc = SparkContext(conf=conf)
    df = sc\
        .textFile('IXQ_20170622080001.csv')\
        .map(lambda line: line.split(','))
    print(df.take(5))
Example #23
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['app.py', 'contourGenerator.py','EventParallelize.py'])
 
    return sc
Example #24
def home3(request):
    #spark_home = os.environ['SPARK_HOME'] = '/usr/local/spark-1.5.2-bin-2.7.1/' #'/usr/local/spark/'
    #sys.path.insert(0,os.path.join(spark_home,'python'))
    #sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.8.2.1-src.zip'))

    #from pyspark import SparkContext, SparkConf
    #sc = SparkContext()
    #data=[1,2,3,4,5]
    #distData = sc.parallelize(data)
    #first = distData.take(1)
    #sc.stop()

    prefs = ["worldnews", "politics", "Economics", "Libertarian"]

    scfg = SparkConf()
    scfg.set("spark.cores.max", 64)
    sc = SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

    try:
        # prep data
        raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)
        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

        # train model
        model_input = all_ratings.union(my_prefs)
        #model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

        # candidate prefs for prediction
        #my_prefs_ids = set([javahash(x) for x in prefs])
        #all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
        #candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)

        #predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()

        #final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)

        #output = list( final.take(30) )
        sc.stop()
        #return output
        recommends = ["asfd"]  # output
    except Exception as e:
        print("App failed. Stopping gracefully")
        sc.stop()
        raise Exception(e)
Example #25
 def init(self):
     os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
     # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
     # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
     conf = SparkConf()
     conf.setMaster("local[10]")
     conf.setAppName("PySparkShell")
     conf.set("spark.executor.memory", "2g")
     conf.set("spark.driver.memory", "1g")
     self.sc = SparkContext(conf=conf)
     self.sqlContext = SQLContext(self.sc)        
Example #26
 def _test_multiple_broadcasts(self, *extra_confs):
     """
     Test broadcast variables make it OK to the executors.  Tests multiple broadcast variables,
     and also multiple jobs.
     """
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     self._test_encryption_helper([5])
     self._test_encryption_helper([5, 10, 20])
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf = conf)
    ssc = StreamingContext(sc, 1) # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
Example #28
 def __connected_spark_cluster(self, resource_url, pilot_description=None):
     conf = SparkConf()
     conf.setAppName("Pilot-Spark")
     if pilot_description is not None:
         for i in pilot_description.keys():
             if i.startswith("spark"):
                 conf.set(i, pilot_description[i])
     conf.setMaster(resource_url)
     print(conf.toDebugString())
     sc = SparkContext(conf=conf)
     sqlCtx = SQLContext(sc)
     pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
     return pilot
Example #29
def create_spark_instance(master = "local", conf = None):
	"""
	master: default "local"
	conf: default 28 cores with 2g memory
	"""
	if not conf:
		conf = SparkConf()
		conf.set("spark.executor.memory", "2g")
		conf.set("spark.cores.max", "28")
		conf.setAppName("spark ipython notebook")

	spark_context = SparkContext(master, conf = conf)
	return spark_context
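A minimal usage sketch for create_spark_instance above:

sc = create_spark_instance()              # local master with the default conf
print(sc.parallelize(range(10)).sum())
sc.stop()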
Example #30
class FireStarter():

  mappings = {
    'http_api': readers.HttpApi,
    'lighter': igniters.Lighter,
    'hdfs': writers.HadoopFileSystem
  }

  def __init__(self, config_file):
    self.config_file = config_file

  def read_config_file(self):
    with open(self.config_file, 'r+') as config_data:
      self.config_data = config_data.read()

  def parse_config_contents(self):
    self.config = json.loads(self.config_data)
    check_requirements = required_config - frozenset(self.config.keys())
    if check_requirements:
      raise ValueError('%s must contain %s' % (self.config_file, ', '.join(check_requirements)))

  def load_modules(self):
    """This loop initializes all of the readers,
    writers, and igniters then stores them in an array"""
    self.modules = OrderedDict()
    self.data = OrderedDict()

    for module in self.config['modules']:
      # Access the module via name, or by order
      new_module = self.modules[module['name']] = self.mappings[module['type']](**module['parameters'])
      self.data[module['name']] = new_module.data

  def create_spark_context(self):
    conf = self.config['spark_conf']
    self.spark_config = SparkConf()
    self.spark_config.setAppName(conf['app_name'])
    for attribute, value in conf['parameters'].items():
        self.spark_config.set(attribute, value)

    self.sc = SparkContext(conf = self.spark_config)

  def run_modules(self):
    for name, module in self.modules.items():
      module.execute()

  def execute(self):
    self.read_config_file()
    self.parse_config_contents()
    self.load_modules()
    self.run_modules()
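A hedged sketch of the JSON config FireStarter appears to expect, written as the equivalent Python dict (the keys follow parse_config_contents, load_modules, and create_spark_context above; module names and parameters are illustrative):

example_config = {
    "modules": [
        {"name": "source", "type": "http_api", "parameters": {"url": "http://example.com/feed"}},
        {"name": "sink", "type": "hdfs", "parameters": {"path": "/tmp/out"}},
    ],
    "spark_conf": {
        "app_name": "FireStarter",
        "parameters": {"spark.executor.memory": "2g"},
    },
}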
Example #31
#!/usr/bin/env python
import sys
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("OrphanPages")
conf.set("spark.driver.bindAddress", "127.0.0.1")
sc = SparkContext(conf=conf)
lines = sc.textFile(sys.argv[1], 1)


#TODO
def getPages(line):
    line = line.rstrip()
    lines = line.split(":")
    p = lines[0].strip('\t\r\n\0 ')
    c = lines[1].strip('\t\r\n\0').split(" ")
    # keep only numeric tokens (removing items from a list while iterating over it skips elements)
    c = [val for val in c if val.isdigit()]
    res = [int(p)] + list(map(int, c))
    res[0] = -res[0]
    return res


def getVal(page):
    if page < 0:
        return (abs(page), 1)  #possible orphan
    else:
        return (page, 0)  #child

orphans = lines.flatMap(lambda line: getPages(line)) \
                .map(lambda p: getVal(p)) \
Example #32
    denom_ = math.sqrt(len(bus_features)) * math.sqrt(len(user_features))
    return num_ / denom_


if __name__ == "__main__":
    start = time.time()
    # input_fp = sys.argv[1]
    # model_fp = sys.argv[2]
    # output_fp = sys.argv[3]

    input_fp = "./data/test_review.json"
    model_fp = "./data/task2.model"
    output_fp = "./data/task2.predict"

    conf = SparkConf()
    conf.set("spark.executor.memory", "8g")
    conf.set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)

    # Read in test data:
    test_rdd = sc.textFile(input_fp)
    user_business_rdd = test_rdd \
        .map(lambda x: json.loads(x)) \
        .map(lambda u_b: (u_b["user_id"], u_b["business_id"]))

    # Read in model components:
    model = sc.textFile(model_fp)
    business_profiles = model \
        .map(lambda x: json.loads(x)) \
        .map(lambda x: x["business"]) \
        .flatMap(lambda x: [(k["business_id"], k["feature_vector"]) for k in x]) \
Example #33
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
import pickle
from imutils.face_utils import FaceAligner
from imutils.face_utils import rect_to_bb
import numpy as np
import imutils
import dlib
import cv2
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

conf = SparkConf().setAppName("drunk streaming v2").setMaster("yarn")
conf.set("spark.scheduler.mode", "FAIR")
conf.set("spark.scheduler.allocation.file",
         "/opt/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml")
sc = SparkContext(conf=conf)
sc.setLocalProperty("spark.scheduler.pool", "pool2")
ssc = StreamingContext(sc, 0.2)
sql_sc = SQLContext(sc)
input_topic = 'input'
output_topic = 'output2'
brokers = "G01-01:2181,G01-02:2181,G01-03:2181,G01-04:2181,G01-05:2181,G01-06:2181,G01-07:2181,G01-08:2181," \
          "G01-09:2181,G01-10:2181,G01-11:2181,G01-12:2181,G01-13:2181,G01-14:2181,G01-15:2181,G01-16:2181"


def my_decoder(s):
    return s
Example #34
        d['count'] = 1
        d = d.groupby(col,as_index=False)['count'].sum()
        d = d[d['count']>=10]
        freqfeas[col] = set(d[col])

def logFun(x):
    #x = int(1000*x)
    x = int(x)
    if x<2:
        return "sp"+str(x)
    else:
        return str(int(math.log(float(x))**2))

os.environ["SPARK_HOME"] = "/home/hadoop/spark-2.0.1-bin-hadoop2.7"   #KeyError: 'SPARK_HOME'
conf = SparkConf()
conf.set("spark.hadoop.validateOutputSpecs", "false")
conf.setMaster('spark://master:7077')
sc=SparkContext(appName='Tpai',conf=conf)
sc.setLogLevel('warn')
sqlContext = SQLContext(sc)


train = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', charset="utf-8") \
    .load('hdfs://192.168.1.118:9000/home/hadoop/dup/train_xgb113U.csv')

df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', charset="utf-8") \
    .load('hdfs://192.168.1.118:9000/home/hadoop/dup/train_xgb113U_df.csv')

y = train[['label']]
Example #35
import re
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram


def part2(context):
    file = context[0]
    words = re.sub('[^a-z0-9]+',' ',context[1].lower()).split()
    file = file.split("/")[-1]
    return (file,words)


#configuring spark
conf = SparkConf()
conf.setAppName( "part2_uni" )
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf = conf)

#reading input
lines =sc.wholeTextFiles("/cosc6339_s17/books-longlist/")
#configuring SparkSession
spark=SparkSession(sc)
hasattr(lines, "toDF")

#tokeinizing the words and converting into dataframes
tokenize=lines.map(part2).toDF(["bookname", "words"])

#converting into unigrams
unigram = NGram(n=1, inputCol = "words", outputCol = "unigrams")
unigramdataframe = unigram.transform(tokenize)
Example #36
    return (new_vals0 + last_vals0,\
            new_vals1 + last_vals1)


######
###### Main script #######
######

signal.signal(signal.SIGINT, signal_handler)

dynamo = dynamodb2.connect_to_region(AWS_REGION)
out_table = Table(DB_TABLE, connection=dynamo)

config = SparkConf()
config.set('spark.streaming.stopGracefullyOnShutdown', True)
#config.set('spark.yarn.executor.memoryOverhead', '2g')

sc = SparkContext(appName='g2ex1', conf=config, pyFiles=['flight.py'])
ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp/g2ex1')

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

filtered = lines.map(lambda line: line.split(","))\
                .map(lambda fields: Flight(fields))\
                .filter(lambda fl: fl.Cancelled == 0)\
                .map(lambda fl: ((fl.Origin, fl.UniqueCarrier), (fl.DepDelay, 1)))\
                .updateStateByKey(updateFunction)

filtered.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))
    records = spark.sql(sql)
    return records


def transform_to_row(t):
    app_package, status = t[0].split('sweeroty')
    return Row(app_package=app_package, status=int(status), count=int(t[1]))


if __name__ == '__main__':
    print('====> Initializing Spark APP')
    localConf = RawConfigParser()
    localConf.read('../config')
    sparkConf = SparkConf()
    for t in localConf.items('spark-config'):
        sparkConf.set(t[0], t[1])
    spark = SparkSession.builder \
      .appName('RLab_Stats_Report___Prepare_APP_Installment_Stats') \
      .config(conf=sparkConf) \
      .enableHiveSupport() \
      .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel('ERROR')

    print('====> Parsing local arguments')
    parser = argparse.ArgumentParser()
    parser.add_argument('--query_month', type=str)
    args = parser.parse_args()
    fr = args.query_month + '01'
    to = args.query_month + str(
        monthrange(int(args.query_month[:4]), int(args.query_month[4:]))[1])
Example #38
from pyspark import SparkConf, SparkContext
from operator import add
from pyspark.sql import SQLContext
from nltk.corpus import stopwords
import re
import string
import nltk
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

conf = SparkConf()
conf.setAppName("Inverted Index")
conf.set("spark.ui.port", "4091")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

path1 = spark.read.load(
    "/bigd45/out312/part-00000-16a6ace5-c303-44c5-aa2e-9b2a0f4259ac-c000.snappy.parquet"
)

removePunct = (lambda x: x not in string.punctuation)
path = "/cosc6339_hw2/gutenberg-500/"
finalWords = []
out = path1.collect()
for (count, word) in out:
    out1 = word
    finalWords.append(out1)

rdd = sc.wholeTextFiles(path)
    print(pairs_rdd.collect())

    frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b)
    print(frequencies_rdd.collect())

    # filter out words with fewer than threshold occurrences
    filtered_rdd = frequencies_rdd.filter(lambda wc: wc[1] >= threshold)
    print(filtered_rdd.collect())


if __name__ == '__main__':

    conf = SparkConf()
    conf.setAppName("WordCount")
    conf.set('spark.executor.memory', '500M')
    conf.set('spark.cores.max', 4)
    try:
        sc = SparkContext(conf=conf)
    except:
        print("Failed to connect!")
        print(sys.exc_info()[0])

    #   sys.argv[0] is the name of the script.
    #   sys.argv[1] is the first parameter: filename
    #   sys.argv[2] is the second parameter: threshold
    input_path = sys.argv[1]  # "file:///Users/mparsian/sample.txt"
    print("input_path: {}".format(input_path))

    # get threshold
    threshold = int(sys.argv[2])
Example #40
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a spark configuration
conf = SparkConf()

# set master
conf.setMaster('local')
# set app name
conf.setAppName("Some spark")
# spark config
conf.set("spark.cores.max", "1")
# spark config
conf.set("spark.executor.memory", "1g")

# Create spark context
sc = SparkContext(conf=conf)

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

print(neg)
                                      outputCol="features")

    pipeline = Pipeline(stages=[tokenizer, countVectorizer])
    model = pipeline.fit(df)
    documents = model.transform(df).select("features").rdd.map(
        lambda x: x.features).zipWithIndex().map(lambda x: [x[1], x[0]])
    return documents, model.stages[1].vocabulary


if __name__ == '__main__':
    start_time1 = time.time()
    args = sys.argv
    sconf = SparkConf()
    sconf.setAppName("lda")
    sconf.setMaster(args[1])
    sconf.set("spark.executor.memory", "6g")
    sconf.set("spark.driver.memory", "6g")
    sconf.set("spark.driver.maxResultSize", "6g")
    sconf.set("spark.yarn.executor.memoryOverhead", "2g")
    sconf.set("spark.yarn.driver.memoryOverhead", "2g")

    sconf.set("spark.eventLog.enabled", "true")
    sconf.set("spark.eventLog.dir",
              "hdfs://" + args[3] + "/user/" + args[4] + "/Logs/")
    sc = SparkContext(conf=sconf)
    dataset = "hdfs://" + args[3] + "/user/" + args[4] + "/In/" + args[2]
    corpus, vocabArray = preprocess(sc,
                                    path=dataset,
                                    vocabsize=50000,
                                    stopwordfile='')
Example #42
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: [email protected]


from pyspark import SparkConf, SparkContext
import os
os.environ['PYSPARK_PYTHON'] = '/export/conda3/envs/pyspark-exp/bin/python3'

if __name__ == "__main__":
    ## Configure Spark parameters
    conf = SparkConf()

    conf.set('spark.master', "local[*]") # yarn
    conf.set('spark.app.name', "word_count")

    ## Create the SparkContext object (sc)
    sc = SparkContext(conf=conf)

    # hello spark
    # hello hadoop
    # hello flink
    file_rdd = sc.textFile("hdfs:///data/word.txt")

    ## Flatten to get the collection of words: ['hello', 'spark', 'hello', 'hadoop', 'hello', 'flink']
    words_rdd = file_rdd.flatMap(lambda line: line.split(" "))
    print(words_rdd.collect())

    ## Map each word to a (word, 1) pair: [('hello', 1), ('spark', 1), ('hello', 1), ('hadoop', 1), ('hello', 1), ('flink', 1)]
    map_rdd = words_rdd.map(lambda x: (x, 1))
    print(map_rdd.collect())
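    ## Continuation sketch (not part of the original snippet): the usual reduceByKey
    ## step that completes the word count
    result_rdd = map_rdd.reduceByKey(lambda a, b: a + b)
    print(result_rdd.collect())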
Example #43
from pyspark import SparkConf
from pyspark.sql import SparkSession
from operator import add
import sys
from pyspark import SparkContext

conf = SparkConf()
conf.setAppName('Assignment3')
conf.set('spark.executor.memory', '2g')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)


def function1(param):
    a = param[0]
    b = param[1]
    w = a[1] * b[1]
    return (a[0], b[0], w)


def function2(param):
    list = []
    for i in range(len(param)):
        for j in range(i + 1, len(param)):
            bc = (param[i], param[j])
            list.append(bc)

    return list


df1 = spark.read.format("com.databricks.spark.avro").load(
Example #44
# author(learning): Scc_hy
# original url: https://github.com/mahmoudparsian/pyspark-algorithms/blob/master/code/chap03/word_count.py
# create date: 2019-12-19
# function: word_count
# data:

import sys, os
from pyspark import SparkContext, SparkConf


def wordcount(sc: SparkContext, input_path: str) -> None:
    rdd = sc.textFile(input_path)
    word_rdd = rdd.flatMap(lambda l: l.split(' '))
    pair_rdd = word_rdd.map(lambda word: (word, 1))
    print(pair_rdd.reduceByKey(lambda a, b: a + b).collect())


if __name__ == '__main__':
    spk_conf = SparkConf()
    spk_conf.setAppName('WordCount').set('spark.executor.memory', '500M')
    spk_conf.set('spark.cores.max', 4)
    try:
        sc = SparkContext(conf=spk_conf)
        input_path = r'D:\My_Learn\pyspark\chap1\sample.txt'
    except:
        print("Failed to connect!")
        print(sys.exc_info()[0])

    # Execute word count
    wordcount(sc, input_path)
#imported the required packages
from pyspark import SparkContext, SparkConf
import numpy
from scipy import spatial

conf = SparkConf()
conf.setAppName('MovieRecommender')
conf.set("spark.executor.memory", "4g")
conf.set("spark.driver.memory", "4g")
conf.set("spark.eventLog.enabled", "true")
conf.set("spark.serializer ", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)

#created rdd on movies and ratings dataset
movies_data = sc.textFile("/home/hadoop/movies.csv").map(
    lambda x: x.split(",")).map(lambda x: (x[0], x[1]))
ratings_data = sc.textFile("/home/hadoop/ratings.csv").map(
    lambda x: x.split(",")).map(lambda x: (x[1], (x[0], x[2])))

#got the userid,(movie title,rating) from two datasets
combineddata = movies_data.join(ratings_data)
requireddata = combineddata.map(lambda x: (x[1][1][0],
                                           (x[1][0], x[1][1][1]))).cache()

#used self join on the rdd
joinedRatings = requireddata.join(requireddata)


#to remove duplicates
def removeDuplicates(ratingvalues):
    ratings = ratingvalues[1]
Example #46
    max_request_threads = webconfig.getint("global",
                                           "server.max_simultaneous_requests")
    log.info("Initializing request ThreadPool to %s" % max_request_threads)
    request_thread_pool = ThreadPool(processes=max_request_threads)

    spark_context = None
    for clazzWrapper in NexusHandler.AVAILABLE_HANDLERS:
        if issubclass(clazzWrapper.clazz(), NexusHandler.SparkHandler):
            if spark_context is None:
                from pyspark import SparkContext, SparkConf

                # Configure Spark
                sp_conf = SparkConf()
                sp_conf.setAppName("nexus-analysis")
                sp_conf.set("spark.scheduler.mode", "FAIR")
                sp_conf.set("spark.executor.memory", "6g")
                spark_context = SparkContext(conf=sp_conf)

            handlers.append((clazzWrapper.path(), ModularNexusHandlerWrapper,
                             dict(clazz=clazzWrapper,
                                  algorithm_config=algorithm_config,
                                  sc=spark_context,
                                  thread_pool=request_thread_pool)))
        else:
            handlers.append((clazzWrapper.path(), ModularNexusHandlerWrapper,
                             dict(clazz=clazzWrapper,
                                  algorithm_config=algorithm_config,
                                  thread_pool=request_thread_pool)))

    class VersionHandler(tornado.web.RequestHandler):
Example #47
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time
import os, sys
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

reload(sys)
sys.setdefaultencoding('utf-8')

conf = SparkConf().set('spark.driver.maxResultSize', '30g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.executor.memory', '15g')
conf.set('spark.executor.instances', 30)
conf.set('spark.executor.cores', 8)
conf.set('spark.executor.extraJavaOptions',
         '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')

spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()

from jg_info import vertex_table_info, edge_info

path_prefix = '/phoebus/_fileservice/users/slmp/shulianmingpin/midfile/open_phone'
Example #48
import sys

from bisect import *
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import StorageLevel

reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == "__main__":
    # $example on:init_session$
    
    conf = SparkConf().setAppName("CreditCardInfo")
    conf.set('spark.cores.max',60)
    conf.set('spark.executor.memory','5g')
    conf.set('spark.rpc.askTimeout',240)
    conf.set('spark.driver.memory','20g')
    conf.set('spark.dynamicAllocation.enabled',True)
    conf.set('spark.shuffle.service.enabled',True)
    conf.set('spark.task.maxFailures',1)
    conf.set('spark.network.timeout','600s')
    conf.set("yarn.nodemanager.vmem-check-enabled","false")
    sc = SparkContext(conf=conf)
    
    sqlContext = SQLContext(sc)

    card_table = sqlContext.read.format("jdbc").option("url","jdbc:mysql://localhost/card_db").option("driver","com.mysql.jdbc.Driver").option("dbtable","card_info").option("user","root").option("password",**********).load()
    
    card_table.createOrReplaceTempView("card_table")
Example #49
    config["stage1File"] = filepath
    with open("config_{0}.json".format(index), "w") as outfile:
        json.dump(config, outfile, indent=4)
    subprocess.call(["./mdm", "config_{0}.json".format(index)])
    subprocess.call([
        "mv", "Final_MDM_{0}.root".format(index),
        "/hdfs/user/hjayatissa/geant_mdm_csi/stage2"
    ])
    subprocess.call([
        "/usr/local/hadoop/bin/hdfs", "dfs", "-chown", "hjayatissa",
        "/user/hjayatissa/geant_mdm_csi/stage2/Final_MDM_{0}.root".format(
            index)
    ])


if __name__ == "__main__":
    sconf = SparkConf().setAppName("mdm-CsI-2")
    sconf.set("spark.executor.memory", "13g")
    sconf.set("spark.python.worker.reuse", "false")
    sc = SparkContext(conf=sconf)

    sc.addFile("mdm")
    sc.addFile("config/config_isobutane_22Ne_6Li_geant_oxford.json")
    sc.addFile("run_oxf.mac")

    file_name = "hdfs://gr-gmaster.tamu.edu:9000//user/hjayatissa/geant_mdm_csi/stage1/MDM_*.root"
    lines = sc.newAPIHadoopFile(file_name, "edu.tamu.hadoop.RootInputFormat",
                                "org.apache.hadoop.io.IntWritable",
                                "org.apache.hadoop.io.Text")
    lines.foreach(lambda x: stage2(x))
Example #50
optimizer = 'adagrad'
loss = 'categorical_crossentropy'

addition = 0
master_port = 5000
send_port = 8000
master_port += addition
send_port += addition
print(master_port)
print(send_port)

chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabdefghijlmnqrtuwxy"
width, height, n_len, n_class = 140, 44, 6, len(chars) + 1

conf = SparkConf()
conf.set("spark.app.name", application_name)
conf.set("spark.master", master)
conf.set("spark.submit.deployMode", deploymode)
conf.set("spark.executor.cores", ` num_cores `)
conf.set("spark.executor.instances", ` num_executors `)
conf.set("spark.sql.warehouse.dir", "hdfs://master:9000/user/hive/warehouse")

###############################################################################
#from pyspark.sql import SparkSession
#sc = SparkSession.builder.master(master).appName(application_name).enableHiveSupport().getOrCreate()
#sqlContext = SQLContext(sc)
################################################################################
sc = SparkContext(conf=conf)
################################################################################

# Define the CTC model and build the trainer
Example #51
#!/usr/bin/env python
# encoding: utf-8
'''
@author: fanyuexiang
@software: pycharm
@file: StreamingInit.py
@time: 2020/2/23 2:41 PM
@desc: initialize Spark Streaming with Python and implement a wordCount
'''
from pyspark.streaming import StreamingContext
from pyspark import SparkConf, SparkContext
import os

PYSPARK_PYTHON ="/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

conf = SparkConf()
conf.set("spark.app.name", "init-streaming")
conf.set("spark.master", "local[2]")
sc = SparkContext(conf=conf)
streamingSc = StreamingContext(sparkContext=sc, batchDuration=1)
lines = streamingSc.socketTextStream(hostname="localhost", port=7777)
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
count = pairs.reduceByKey(lambda x, y: x+y)
count.pprint()
streamingSc.start()
streamingSc.awaitTermination()

Example #52
from pyspark import SparkConf
from pyspark.sql.types import *

import sys
import time
import signal

import itertools
import cassandra

from cassandra.cluster import Cluster
from cassandra.query import named_tuple_factory
from flight import Flight
from itertools import islice, chain

config = SparkConf()
config.set("spark.streaming.stopGracefullyOnShutdown", "true")

filtered = None
ssc = None


def grouper_it(n, iterable):
    it = iter(iterable)
    while True:
        chunk_it = itertools.islice(it, n)
        try:
            first_el = next(chunk_it)
        except StopIteration:
            return
        yield itertools.chain((first_el, ), chunk_it)
Example #53
def spark():

    try:
        import pyspark
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession
        from pyspark.sql import types

        conf = SparkConf()

        conf.set("spark.sql.shuffle.partitions", "1")
        conf.set("spark.jars.ivy", "/home/jovyan/.ivy2/")
        conf.set("spark.driver.extraClassPath",
                 "jars/scala-udf-similarity-0.0.6.jar")
        conf.set("spark.jars", "jars/scala-udf-similarity-0.0.6.jar")
        conf.set("spark.driver.memory", "4g")
        conf.set("spark.sql.shuffle.partitions", "24")

        sc = SparkContext.getOrCreate(conf=conf)

        spark = SparkSession(sc)

        udfs = [
            ("jaro_winkler_sim", "JaroWinklerSimilarity", types.DoubleType()),
            ("jaccard_sim", "JaccardSimilarity", types.DoubleType()),
            ("cosine_distance", "CosineDistance", types.DoubleType()),
            ("Dmetaphone", "DoubleMetaphone", types.StringType()),
            ("QgramTokeniser", "QgramTokeniser", types.StringType()),
            ("Q3gramTokeniser", "Q3gramTokeniser", types.StringType()),
            ("Q4gramTokeniser", "Q4gramTokeniser", types.StringType()),
            ("Q5gramTokeniser", "Q5gramTokeniser", types.StringType()),
        ]

        for a, b, c in udfs:
            spark.udf.registerJavaFunction(a, "uk.gov.moj.dash.linkage." + b,
                                           c)
        SPARK_EXISTS = True
    except:
        SPARK_EXISTS = False

    if SPARK_EXISTS:
        print("Spark exists, running spark tests")
        yield spark
    else:
        spark = None
        logger.error("Spark not available")
        print("Spark not available")
        yield spark
def parseFeat(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return values[:-1]


def parseLabel(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return values[-1]


def error(point, model):
    center = model.centers[model.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))


conf = SparkConf()
conf.set("spark.master", "local")
sc = SparkContext(conf=conf)

data = sc.textFile("practice6_train.csv")
trData = data.map(parseFeat)

data = sc.textFile("practice6_test.csv")
tsData = data.map(parseFeat)
tsLabel = data.map(parseLabel)

kmeans_list = []
for i in range(30):
    kmeans_list.append(KMeans.train(trData, k=10, maxIterations=100, seed=i))

obj_list = []
for i in range(30):
Example #55
    def __init__(self,
                 sc=None,
                 app_name="Hail",
                 master=None,
                 local='local[*]',
                 log=None,
                 quiet=False,
                 append=False,
                 min_block_size=1,
                 branching_factor=50,
                 tmp_dir=None,
                 default_reference="GRCh37",
                 idempotent=False,
                 global_seed=6348563392232659379,
                 optimizer_iterations=None,
                 _backend=None):

        if Env._hc:
            if idempotent:
                return
            else:
                raise FatalError(
                    'Hail has already been initialized, restart session '
                    'or stop Hail to change configuration.')

        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(
                __name__, "hail-all-spark.jar")
            assert os.path.exists(
                hail_jar_path), f'{hail_jar_path} does not exist'
            sys.stderr.write(f'using hail jar at {hail_jar_path}\n')
            conf = SparkConf()
            conf.set('spark.driver.extraClassPath', hail_jar_path)
            conf.set('spark.executor.extraClassPath', hail_jar_path)
            SparkContext._ensure_initialized(conf=conf)
        else:
            SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        # hail package
        self._hail = getattr(self._jvm, 'is').hail

        self._warn_cols_order = True
        self._warn_entries_order = True

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        jsc = sc._jsc.sc() if sc else None

        if _backend is None:
            apiserver_url = os.environ.get('HAIL_APISERVER_URL')
            if apiserver_url is not None:
                _backend = ServiceBackend(apiserver_url)
            else:
                _backend = SparkBackend()
        self._backend = _backend

        tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')
        optimizer_iterations = get_env_or_default(optimizer_iterations,
                                                  'HAIL_OPTIMIZER_ITERATIONS',
                                                  3)

        version = read_version_info()
        hail.__version__ = version

        if log is None:
            log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                            suffix=f'-{version}.log')
        self._log = log

        # we always pass 'quiet' to the JVM because stderr output needs
        # to be routed through Python separately.
        # if idempotent:
        if idempotent:
            self._jhc = self._hail.HailContext.getOrCreate(
                jsc, app_name, joption(master), local, log, True, append,
                min_block_size, branching_factor, tmp_dir,
                optimizer_iterations)
        else:
            self._jhc = self._hail.HailContext.apply(jsc, app_name,
                                                     joption(master), local,
                                                     log, True, append,
                                                     min_block_size,
                                                     branching_factor, tmp_dir,
                                                     optimizer_iterations)

        self._jsc = self._jhc.sc()
        self.sc = sc if sc else SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        self._jsql_context = self._jhc.sqlContext()
        self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

        super(HailContext, self).__init__()

        # do this at the end in case something errors, so we don't raise the above error without a real HC
        Env._hc = self

        ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True)
        ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True)
        ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True)

        if default_reference in ReferenceGenome._references:
            self._default_ref = ReferenceGenome._references[default_reference]
        else:
            self._default_ref = ReferenceGenome.read(default_reference)

        jar_version = self._jhc.version()

        if jar_version != version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {version}")

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger('localhost', 12888)

            self._hail.HailContext.startProgressBar(self._jsc)

            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

            if version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        install_exception_handler()
        Env.set_seed(global_seed)
Example #56
local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from ml import diff_feature_cls, diff_train_cls


def run(sc, sql_context, is_hive):
    diff_feature_cls.main(sc, sql_context, is_hive=True)
    diff_train_cls.main(sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")

    sc = SparkContext(appName="bintrade_candidate",
                      master="yarn-client",
                      conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")

    sqlContext.sql("use fex")

    run(sc, sqlContext, is_hive=True)
Example #57
from pyspark import SparkContext, SparkConf
import numpy as np
conf = SparkConf()
conf.set('master', 'spark://hadoop-maste:7077')
context = SparkContext(conf=conf)
acc = context.accumulator(0)
print(type(acc), acc.value)
rdd = context.parallelize(np.arange(101), 5)


def acc_add(a):
    acc.add(a)
    return a


rdd2 = rdd.map(acc_add)
print(rdd2.collect())
print(acc.value)
context.stop()
Example #58
 def getConf(self):
     conf = SparkConf()
     for k, v in DEFAULT_SPARK_CONFIG.items():
         conf.set(k, v)
     return conf
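DEFAULT_SPARK_CONFIG is not included in this snippet; a hypothetical shape it could take, given how getConf iterates over it:

DEFAULT_SPARK_CONFIG = {
    "spark.app.name": "default-app",
    "spark.master": "local[*]",
    "spark.executor.memory": "2g",
}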
Example #59
def papers_citations(sc):
	#papers and number of citations per year
	citations = sc.textFile("/corpora/corpus-microsoft-academic-graph/data/PaperReferences.tsv.bz2")
	citations = citations.map(lambda line : line.split("\t")).map(lambda c: (c[0], c[1]))

	papers = papers_newer_than(sc, 2013)
	papers = papers.map(lambda p: (p[0], p[3]))


	#join
	papersMap = sc.broadcast(papers.collectAsMap());

	rowFunc1 = lambda x: (x[0], x[1], papersMap.value.get(x[1], -1))
	def mapFunc1(partition):
		for row in partition:
			yield rowFunc1(row)


	result = citations.mapPartitions(mapFunc1, preservesPartitioning=True)
	result = result.filter(lambda c: c[2] != -1).map(lambda x: (x[0], x[1]))
	result.saveAsHadoopFile('/user/bd-ss16-g3/data/citations_2014', "org.apache.hadoop.mapred.TextOutputFormat", compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")

if __name__ == "__main__":
	# Configure OPTIONS
	conf = SparkConf().setAppName(APP_NAME)
	conf = conf.setMaster("yarn-client")
	conf = conf.set("spark.executor.memory", "25g").set("spark.driver.memory", "25g").set("spark.mesos.executor.memoryOverhead", "10000")
	sc   = SparkContext(conf=conf)
	#papers_citations(sc)
	papers_with_citations(sc)
        save_results(toSave)


def calculateAverage(newVal, accumulativeAvg):
    if accumulativeAvg is None:
        accumulativeAvg = (0.0, 0, 0.0)
    total = sum(newVal, accumulativeAvg[0])
    count = accumulativeAvg[1] + len(newVal)
    avg = total / float(count)
    return (total, count, avg)


if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("Problem_2-1")
    conf.set("spark.streaming.kafka.maxRatePerPartition", 50000)
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.python.worker.memory", "1g")

    airports = ['CMI', 'BWI', 'MIA', 'LAX', 'IAH', 'SFO']

    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("/tmp/streaming")

    brokers = "b-1.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092,b-2.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092"
    topic = "cs598-task2 "
    kafka_consumer_group = str(uuid.uuid4())
    kafka_client_params = {
        "metadata.broker.list": brokers,