def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # Tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #2
    def __connected_yarn_spark_cluster(self, pilotcompute_description):

        number_cores = 1
        if "number_cores" in pilotcompute_description:
            number_cores = int(pilotcompute_description["number_cores"])

        number_of_processes = 1
        if "number_of_processes" in pilotcompute_description:
            number_of_processes = int(pilotcompute_description["number_of_processes"])

        executor_memory = "1g"
        if "physical_memory_per_process" in pilotcompute_description:
            executor_memory = pilotcompute_description["physical_memory_per_process"]

        conf = SparkConf()
        conf.set("spark.num.executors", str(number_of_processes))
        conf.set("spark.executor.instances", str(number_of_processes))
        conf.set("spark.executor.memory", executor_memory)
        conf.set("spark.executor.cores", number_cores)
        if pilotcompute_description!=None:
            for i in pilotcompute_description.keys():
                if i.startswith("spark"):
                    conf.set(i, pilotcompute_description[i])
        conf.setAppName("Pilot-Spark")
        conf.setMaster("yarn-client")
        sc = SparkContext(conf=conf)
        sqlCtx = SQLContext(sc)
        pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
        return pilot
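A hedged example of the pilotcompute_description dict this method consumes (the keys are taken from the code above; the values and the extra spark.* entry are illustrative):

example_description = {
    "number_of_processes": 4,             # becomes spark.executor.instances
    "number_cores": 2,                    # becomes spark.executor.cores
    "physical_memory_per_process": "2g",  # becomes spark.executor.memory
    "spark.default.parallelism": "16",    # any spark.* key is forwarded to SparkConf
}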
 def setUp(self):
     conf = SparkConf().setAppName('testing').setMaster('local[2]').set('spark.driver.host', 'localhost')
     conf.set('spark.ui.showConsoleProgress', False)
     self.session = SparkSession.builder.config(conf=conf).getOrCreate()
     self.test_data = [
         ('Ricardo', 'engineering', 2),
         ('Tisa', 'sales', 3),
         ('Sheree', 'marketing', 4), 
         ('Chantelle', 'engineering', 5),
         ('Kylee', 'finance', 2),
         ('Tamatha', 'marketing', 5),
         ('Trena', 'engineering', 2),
         ('Arica', 'engineering', 1),
         ('Santina', 'finance', 2),
         ('Daria', 'marketing', 1),
         ('Magnolia', 'sales', 2),
         ('Antonina', 'finance', 1),
         ('Sumiko', 'engineering', 1),
         ('Carmen', 'sales', 2),
         ('Delois', 'engineering', 1),
         ('Luetta', 'marketing', 3),
         ('Yessenia', 'sales', 1),
         ('Petra', 'engineering', 3),
         ('Charisse', 'engineering', 4),
         ('Lillian', 'engineering', 3),
         ('Wei', 'engineering', 2),
         ('Lahoma', 'sales', 2),
         ('Lucilla', 'marketing', 1),
         ('Stephaine', 'finance', 2),
     ]
def configureSpark():
	conf = SparkConf()
	conf.setMaster("local")
	conf.setAppName("Apache Spark Alarm Parser")
	conf.set("spark.executor.memory", "1g")
	sc = SparkContext(conf = conf)
	return sc
Example #5
    def setUpClass(cls):

        class_name = cls.__name__
        conf = SparkConf()
        conf.set('spark.app.name', class_name)

        # Read the spark configuration and update the spark conf
        test_spark_config = ConfigParser.ConfigParser()
        test_spark_config.read('test_config.cfg')
        test_spark_config.sections()
        configs = dict(test_spark_config.items('spark_conf_test_generic'))
        for k, v in configs.items():
            conf.set(k, v)
        cls.spark_test_configs = configs
        # Create the spark context
        cls.sc = SparkContext(conf=conf)
        if 'PYSPARK_DRIVER_PYTHON' in configs.keys():
            cls.sc.pythonExec = configs['PYSPARK_DRIVER_PYTHON']
        else:
            cls.sc.pythonExec = 'python2.7'

        logger = cls.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s')
        cls.logger = logging.getLogger(__name__)
        cls.logger.setLevel(logging.DEBUG)
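A hedged sketch of the test_config.cfg file this setup reads (the section name comes from the code above; the individual entries are illustrative):

[spark_conf_test_generic]
spark.master = local[2]
spark.executor.memory = 1g
PYSPARK_DRIVER_PYTHON = python2.7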
class SparkContextFactory:
  def __init__(self):
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["SPARK_HOME"] = "C:\Spark"
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
    ##sys.path.append("C:\Spark\python")
    ##sys.path.append("C:\Spark\bin")

    # specify spark home
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
    # specify pyspark path so its libraries can be accessed by this application
    sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext

    self.conf = SparkConf().setMaster("yarn-client")
    self.conf.setAppName("MrT")
    self.conf.set("spark.executor.memory", "5g")
    self.conf.set("spark.driver.memory", "10g")

    self.sc = SparkContext(conf = self.conf, pyFiles =
    ["ComputeCovHistory.py", "go.py", "risk_DSconvert.py", "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

    """
    toDF method is a monkey patch executed inside SQLContext constructor
    so to be able to use it you have to create a SQLContext first
    """
    self.sqlContextInstance = SQLContext(self.sc)


  def disconnect(self):
    self.sc.stop()
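A minimal usage sketch for the factory above (the sample data is illustrative); toDF is only available because the SQLContext was constructed in __init__:

factory = SparkContextFactory()
df = factory.sc.parallelize([(1, "a"), (2, "b")]).toDF(["id", "value"])
df.show()
factory.disconnect()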
Example #7
def get_default_spark_conf():
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local-cluster[3,1,2048]"). \
        set("spark.ext.h2o.disable.ga","true"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.backend.cluster.mode", ExternalClusterTestHelper.cluster_mode()). \
        set("spark.ext.h2o.cloud.name", ExternalClusterTestHelper.unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", os.getenv("spark.ext.h2o.external.start.mode", "manual")) .\
        set("spark.sql.warehouse.dir", "file:" + os.path.join(os.getcwd(), "spark-warehouse"))


    if ExternalClusterTestHelper.tests_in_external_mode():
        conf.set("spark.ext.h2o.client.ip", ExternalClusterTestHelper.local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "2")

    return conf
Example #8
 def getSparkContext(self, appName, master):
     print(appName)
     print(master)
     conf = SparkConf().setAppName(appName).setMaster(master)
     conf.set("spark.local.ip", "127.0.0.1")
     conf.set("spark.driver.host", "127.0.0.1")
     return SparkContext(conf=conf)
Example #9
def main():
    """
    Main entry point of the application
    """

    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc, file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
def main():
	conf = SparkConf()
	conf.set("spark.default.parallelism", "24")
	sc = SparkContext(appName="PhoneLab Preprocessing", conf=conf)

	lines = sc.textFile(data_files, use_unicode=False)

	# Create LogLine objects and filter out empty lines
	logs = lines.flatMap(ll_mapper)

	# Save in an intermediate format
	logs.saveAsTextFile(out_dir, compressionCodecClass=codec)
	return

	# Gap detection
	keyed = logs.map(ll_gap_map)
	merged = keyed.groupByKey()

	# At this point we have ((boot_id, date), [line_num]) tuples The last step.
	# is to find all the gaps within each key/tuple.
	result = merged.flatMap(find_gaps)
	gaps = result.collect()

	fd = open("/spark/gaps.json", 'w')
	fd.write(json.dumps(gaps, indent=4))
Example #11
    def start_spark(self,
                    spark_conf=None, 
                    executor_memory=None,
                    profiling=False, 
                    graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11', 
                    extra_conf = None):
        """Launch a SparkContext 
        
        Parameters
        ----------
        spark_conf: path
            path to a spark configuration directory
        executor_memory: string
            executor memory in java memory string format, e.g. '4G'
            If `None`, `memory_per_executor` is used. 
        profiling: boolean
            whether to turn on python profiling or not
        graphframes_package: string
            which graphframes to load - if it isn't found, spark will attempt to download it
        extra_conf: dict
            additional configuration options
        """

        os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell"\
                                            .format(graphframes_package=graphframes_package)
        
        if spark_conf is None:
            spark_conf = os.path.join(os.environ['SPARK_HOME'], 'conf')

        os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)

        os.environ['PYSPARK_PYTHON'] = sys.executable

        try: 
            import findspark; findspark.init()
            from pyspark import SparkContext, SparkConf
        except ImportError: 
            raise ImportError("Unable to find pyspark -- are you sure SPARK_HOME is set?")

        conf = SparkConf()

        conf.set('spark.driver.maxResultSize', '0')

        if executor_memory is None: 
            executor_memory = '%dM'%self.memory_per_executor

        conf.set('spark.executor.memory', executor_memory)

        if profiling: 
            conf.set('spark.python.profile', 'true')
        else:
            conf.set('spark.python.profile', 'false')
        
        if extra_conf is not None: 
            for k,v in extra_conf.items(): 
                conf.set(k,v)

        sc = SparkContext(master=self.master_url(), conf=conf)

        return sc    
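A hedged usage sketch for start_spark (assumes `cluster` is an instance of the enclosing class, which supplies memory_per_executor and master_url(); the extra_conf entry is illustrative):

sc = cluster.start_spark(
    executor_memory='4G',
    profiling=True,
    extra_conf={'spark.speculation': 'true'},
)
print(sc.parallelize(range(100)).count())
sc.stop()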
Example #12
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example #13
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # Tally the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
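The helpers storeOffsetRanges and printOffsetRanges are not shown in the snippet above; a sketch following the offset-tracking pattern from the Spark Streaming + Kafka integration guide could look like this:

offsetRanges = []

def storeOffsetRanges(rdd):
    # Stash the Kafka offset ranges of this batch's RDD for later inspection
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))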
Example #14
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt",  minPartitions= 500, use_unicode=False)
    rdd.unpersist()
#     print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter = ' '))
    
    GA.logInConsole(0 , "Data Vectorized!")
    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD,sc, 5))
#     GA.logInConsole(-10, "GA with range 4")
#     ss.append(GA.parallel_GA_main(norm,sc, 4))
#     GA.logInConsole(-10, "GA with range 5")
#     ss.append(GA.parallel_GA_main(norm,sc, 5))
#     GA.logInConsole(-10, "GA with range 3 and Sampled data set")
#    sampleRDD = norm.sample(False, 0.6, seed=10)
#    ss.append(GA.parallel_GA_main(sampleRDD,sc, 3))
    print(ss)
    #selectedSS = voted_subsapces(ss)
#     SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
Example #15
 def _test_broadcast_on_driver(self, *extra_confs):
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     bs = self.sc.broadcast(value=5)
     self.assertEqual(5, bs.value)
Example #16
class OWSparkContext(SharedSparkContext, widget.OWWidget):
    priority = 0
    name = "Context"
    description = "Create a shared Spark (sc) and Hive (hc) Contexts"
    icon = "../icons/spark.png"

    want_main_area = False
    resizing_enabled = True

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_predefined = dict(self.conf.getAll())
        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace = True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'OrangeSpark'
        main_parameters['spark.master'] = 'yarn-client'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in main_parameters.items():
            default_value = all_predefined.setdefault(k, v)
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=default_value)
            all_predefined.pop(k)

        for k, v in all_predefined.items():
            self.gui_parameters[k] = GuiParam(parent_widget=box, label=k, default_value=str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label = 'Submit', callback = self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):

        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())

        self.sc = SparkContext(conf = self.conf)
        self.hc = HiveContext(self.sc)
Example #17
def create_spark_context(app_name="Quiz Bowl", lm_memory=False, profile=False):
    spark_conf = SparkConf()
    if lm_memory:
        pass
        # spark_conf = spark_conf.set('spark.max.cores', 30).set('spark.executor.cores', 30)
    if profile:
        spark_conf = spark_conf.set('spark.python.profile', True)
    spark_conf = spark_conf.set('spark.akka.frameSize', 300)
    return SparkContext(appName=app_name, master=QB_SPARK_MASTER, conf=spark_conf)
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data")
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    rdd = load_data(sc)  
    print(rdd.getNumPartitions())
    parallel_GA_main(sc, rdd, 5)
    
    sc.stop()
Example #19
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("ctr-server")
    conf.set('spark.kryoserializer.buffer', '512mb')
    conf.set('spark.kryoserializer.buffer.max', '512')
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['/home/ec2-user/engine.py', '/home/ec2-user/app.py'])
 
    return sc
Example #20
    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))
Example #21
def createSparkConf():
	from pyspark import SparkConf
	test_properties = conftest.test_properties()

	conf = SparkConf()
	conf.set("cloudant.host", test_properties["cloudanthost"])
	conf.set("cloudant.username", test_properties["cloudantusername"])
	conf.set("cloudant.password", test_properties["cloudantpassword"])
	
	return conf
Example #22
def main():
    # Spark Configurations
    conf = SparkConf()
    conf.set("spark.master", "local[*]")
    conf = conf.setAppName('Learning PySpark')
    sc = SparkContext(conf=conf)
    df = sc\
        .textFile('IXQ_20170622080001.csv')\
        .map(lambda line: line.split(','))
    print(df.take(5))
Example #23
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['app.py', 'contourGenerator.py','EventParallelize.py'])
 
    return sc
Example #24
def home3(request):
    #spark_home = os.environ['SPARK_HOME'] = '/usr/local/spark-1.5.2-bin-2.7.1/' #'/usr/local/spark/'
    #sys.path.insert(0,os.path.join(spark_home,'python'))
    #sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.8.2.1-src.zip'))

    #from pyspark import SparkContext, SparkConf
    #sc = SparkContext()
    #data=[1,2,3,4,5]
    #distData = sc.parallelize(data)
    #first = distData.take(1)
    #sc.stop()

    prefs = ["worldnews", "politics", "Economics", "Libertarian"]

    scfg = SparkConf()
    scfg.set("spark.cores.max", 64)
    sc = SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

    try:
        # prep data
        raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)
        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

        # train model
        model_input = all_ratings.union(my_prefs)
        #model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

        # candidate prefs for prediction
        #my_prefs_ids = set([javahash(x) for x in prefs])
        #all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
        #candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)

        #predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()

        #final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)

        #output = list( final.take(30) )
        sc.stop()
        #return output
        recommends = ["asfd"]  # output
    except Exception as e:
        print("App failed. Stopping gracefully")
        sc.stop()
        raise Exception(e)
Example #25
 def init(self):
     os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
     # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
     # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
     conf = SparkConf()
     conf.setMaster("local[10]")
     conf.setAppName("PySparkShell")
     conf.set("spark.executor.memory", "2g")
     conf.set("spark.driver.memory", "1g")
     self.sc = SparkContext(conf=conf)
     self.sqlContext = SQLContext(self.sc)        
Example #26
 def _test_multiple_broadcasts(self, *extra_confs):
     """
     Test broadcast variables make it OK to the executors.  Tests multiple broadcast variables,
     and also multiple jobs.
     """
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     self._test_encryption_helper([5])
     self._test_encryption_helper([5, 10, 20])
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf = conf)
    ssc = StreamingContext(sc, 1) # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
Example #28
 def __connected_spark_cluster(self, resource_url, pilot_description=None):
     conf = SparkConf()
     conf.setAppName("Pilot-Spark")
     if pilot_description is not None:
         for i in pilot_description.keys():
             if i.startswith("spark"):
                 conf.set(i, pilot_description[i])
     conf.setMaster(resource_url)
     print(conf.toDebugString())
     sc = SparkContext(conf=conf)
     sqlCtx = SQLContext(sc)
     pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
     return pilot
Example #29
def create_spark_instance(master = "local", conf = None):
	"""
	master: default "local"
	conf: default 28 cores with 2g memory
	"""
	if not conf:
		conf = SparkConf()
		conf.set("spark.executor.memory", "2g")
		conf.set("spark.cores.max", "28")
		conf.setAppName("spark ipython notebook")

	spark_context = SparkContext(master, conf = conf)
	return spark_context
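A minimal usage sketch for create_spark_instance above:

sc = create_spark_instance()              # local master with the default conf
print(sc.parallelize(range(10)).sum())
sc.stop()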
Example #30
class FireStarter():

  mappings = {
    'http_api': readers.HttpApi,
    'lighter': igniters.Lighter,
    'hdfs': writers.HadoopFileSystem
  }

  def __init__(self, config_file):
    self.config_file = config_file

  def read_config_file(self):
    with open(self.config_file, 'r+') as config_data:
      self.config_data = config_data.read()

  def parse_config_contents(self):
    self.config = json.loads(self.config_data)
    check_requirements = required_config - frozenset(self.config.keys())
    if check_requirements:
      raise ValueError('%s must contain %s' % (self.config_file, ', '.join(check_requirements)))

  def load_modules(self):
    """This loop initializes all of the readers,
    writers, and igniters then stores them in an array"""
    self.modules = OrderedDict()
    self.data = OrderedDict()

    for module in self.config['modules']:
      # Access the module via name, or by order
      new_module = self.modules[module['name']] = self.mappings[module['type']](**module['parameters'])
      self.data[module['name']] = new_module.data

  def create_spark_context(self):
    conf = self.config['spark_conf']
    self.spark_config = SparkConf()
    self.spark_config.setAppName(conf['app_name'])
    for attribute, value in conf['parameters'].items():
        self.spark_config.set(attribute, value)

    self.sc = SparkContext(conf = self.spark_config)

  def run_modules(self):
    for name, module in self.modules.items():
      module.execute()

  def execute(self):
    self.read_config_file()
    self.parse_config_contents()
    self.load_modules()
    self.run_modules()
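A hedged sketch of the JSON config FireStarter appears to expect, written as the equivalent Python dict (the keys follow parse_config_contents, load_modules, and create_spark_context above; module names and parameters are illustrative):

example_config = {
    "modules": [
        {"name": "source", "type": "http_api", "parameters": {"url": "http://example.com/feed"}},
        {"name": "sink", "type": "hdfs", "parameters": {"path": "/tmp/out"}},
    ],
    "spark_conf": {
        "app_name": "FireStarter",
        "parameters": {"spark.executor.memory": "2g"},
    },
}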
Example #31
#!/usr/bin/env python
import sys
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("OrphanPages")
conf.set("spark.driver.bindAddress", "127.0.0.1")
sc = SparkContext(conf=conf)
lines = sc.textFile(sys.argv[1], 1)


#TODO
def getPages(line):
    line = line.rstrip()
    lines = line.split(":")
    p = lines[0].strip('\t\r\n\0 ')
    c = lines[1].strip('\t\r\n\0').split(" ")
    # keep only numeric tokens (removing items from a list while iterating over it skips elements)
    c = [val for val in c if val.isdigit()]
    res = [int(p)] + list(map(int, c))
    res[0] = -res[0]
    return res


def getVal(page):
    if page < 0:
        return (abs(page), 1)  #possible orphan
    else:
        return (page, 0)  #child

orphans = lines.flatMap(lambda line: getPages(line)) \
                .map(lambda p: getVal(p)) \
Example #32
    denom_ = math.sqrt(len(bus_features)) * math.sqrt(len(user_features))
    return num_ / denom_


if __name__ == "__main__":
    start = time.time()
    # input_fp = sys.argv[1]
    # model_fp = sys.argv[2]
    # output_fp = sys.argv[3]

    input_fp = "./data/test_review.json"
    model_fp = "./data/task2.model"
    output_fp = "./data/task2.predict"

    conf = SparkConf()
    conf.set("spark.executor.memory", "8g")
    conf.set("spark.driver.memory", "8g")
    sc = SparkContext(conf=conf)

    # Read in test data:
    test_rdd = sc.textFile(input_fp)
    user_business_rdd = test_rdd \
        .map(lambda x: json.loads(x)) \
        .map(lambda u_b: (u_b["user_id"], u_b["business_id"]))

    # Read in model components:
    model = sc.textFile(model_fp)
    business_profiles = model \
        .map(lambda x: json.loads(x)) \
        .map(lambda x: x["business"]) \
        .flatMap(lambda x: [(k["business_id"], k["feature_vector"]) for k in x]) \
Example #33
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
import pickle
from imutils.face_utils import FaceAligner
from imutils.face_utils import rect_to_bb
import numpy as np
import imutils
import dlib
import cv2
import time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

conf = SparkConf().setAppName("drunk streaming v2").setMaster("yarn")
conf.set("spark.scheduler.mode", "FAIR")
conf.set("spark.scheduler.allocation.file",
         "/opt/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml")
sc = SparkContext(conf=conf)
sc.setLocalProperty("spark.scheduler.pool", "pool2")
ssc = StreamingContext(sc, 0.2)
sql_sc = SQLContext(sc)
input_topic = 'input'
output_topic = 'output2'
brokers = "G01-01:2181,G01-02:2181,G01-03:2181,G01-04:2181,G01-05:2181,G01-06:2181,G01-07:2181,G01-08:2181," \
          "G01-09:2181,G01-10:2181,G01-11:2181,G01-12:2181,G01-13:2181,G01-14:2181,G01-15:2181,G01-16:2181"


def my_decoder(s):
    return s
Example #34
        d['count'] = 1
        d = d.groupby(col,as_index=False)['count'].sum()
        d = d[d['count']>=10]
        freqfeas[col] = set(d[col])

def logFun(x):
    #x = int(1000*x)
    x = int(x)
    if x<2:
        return "sp"+str(x)
    else:
        return str(int(math.log(float(x))**2))

os.environ["SPARK_HOME"] = "/home/hadoop/spark-2.0.1-bin-hadoop2.7"   #KeyError: 'SPARK_HOME'
conf = SparkConf()
conf.set("spark.hadoop.validateOutputSpecs", "false")
conf.setMaster('spark://master:7077')
sc=SparkContext(appName='Tpai',conf=conf)
sc.setLogLevel('warn')
sqlContext = SQLContext(sc)


train = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', charset="utf-8") \
    .load('hdfs://192.168.1.118:9000/home/hadoop/dup/train_xgb113U.csv')

df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', charset="utf-8") \
    .load('hdfs://192.168.1.118:9000/home/hadoop/dup/train_xgb113U_df.csv')

y = train[['label']]
Example #35
import re
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram


def part2(context):
    file = context[0]
    words = re.sub('[^a-z0-9]+',' ',context[1].lower()).split()
    file = file.split("/")[-1]
    return (file,words)


#configuring spark
conf = SparkConf()
conf.setAppName( "part2_uni" )
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf = conf)

#reading input
lines =sc.wholeTextFiles("/cosc6339_s17/books-longlist/")
#configuring SparkSession
spark=SparkSession(sc)
hasattr(lines, "toDF")

#tokeinizing the words and converting into dataframes
tokenize=lines.map(part2).toDF(["bookname", "words"])

#converting into unigrams
unigram = NGram(n=1, inputCol = "words", outputCol = "unigrams")
unigramdataframe = unigram.transform(tokenize)
Example #36
    return (new_vals0 + last_vals0,\
            new_vals1 + last_vals1)


######
###### Main script #######
######

signal.signal(signal.SIGINT, signal_handler)

dynamo = dynamodb2.connect_to_region(AWS_REGION)
out_table = Table(DB_TABLE, connection=dynamo)

config = SparkConf()
config.set('spark.streaming.stopGracefullyOnShutdown', True)
#config.set('spark.yarn.executor.memoryOverhead', '2g')

sc = SparkContext(appName='g2ex1', conf=config, pyFiles=['flight.py'])
ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp/g2ex1')

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

filtered = lines.map(lambda line: line.split(","))\
                .map(lambda fields: Flight(fields))\
                .filter(lambda fl: fl.Cancelled == 0)\
                .map(lambda fl: ((fl.Origin, fl.UniqueCarrier), (fl.DepDelay, 1)))\
                .updateStateByKey(updateFunction)

filtered.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))
    records = spark.sql(sql)
    return records


def transform_to_row(t):
    app_package, status = t[0].split('sweeroty')
    return Row(app_package=app_package, status=int(status), count=int(t[1]))


if __name__ == '__main__':
    print('====> Initializing Spark APP')
    localConf = RawConfigParser()
    localConf.read('../config')
    sparkConf = SparkConf()
    for t in localConf.items('spark-config'):
        sparkConf.set(t[0], t[1])
    spark = SparkSession.builder \
      .appName('RLab_Stats_Report___Prepare_APP_Installment_Stats') \
      .config(conf=sparkConf) \
      .enableHiveSupport() \
      .getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel('ERROR')

    print('====> Parsing local arguments')
    parser = argparse.ArgumentParser()
    parser.add_argument('--query_month', type=str)
    args = parser.parse_args()
    fr = args.query_month + '01'
    to = args.query_month + str(
        monthrange(int(args.query_month[:4]), int(args.query_month[4:]))[1])
Example #38
from pyspark import SparkConf, SparkContext
from operator import add
from pyspark.sql import SQLContext
from nltk.corpus import stopwords
import re
import string
import nltk
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

conf = SparkConf()
conf.setAppName("Inverted Index")
conf.set("spark.ui.port", "4091")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

path1 = spark.read.load(
    "/bigd45/out312/part-00000-16a6ace5-c303-44c5-aa2e-9b2a0f4259ac-c000.snappy.parquet"
)

removePunct = (lambda x: x not in string.punctuation)
path = "/cosc6339_hw2/gutenberg-500/"
finalWords = []
out = path1.collect()
for (count, word) in out:
    out1 = word
    finalWords.append(out1)

rdd = sc.wholeTextFiles(path)
    print(pairs_rdd.collect())

    frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b)
    print(frequencies_rdd.collect())

    # filter out words with fewer than threshold occurrences
    filtered_rdd = frequencies_rdd.filter(lambda wc: wc[1] >= threshold)
    print(filtered_rdd.collect())


if __name__ == '__main__':

    conf = SparkConf()
    conf.setAppName("WordCount")
    conf.set('spark.executor.memory', '500M')
    conf.set('spark.cores.max', 4)
    try:
        sc = SparkContext(conf=conf)
    except:
        print("Failed to connect!")
        print(sys.exc_info()[0])

    #   sys.argv[0] is the name of the script.
    #   sys.argv[1] is the first parameter: filename
    #   sys.argv[2] is the second parameter: threshold
    input_path = sys.argv[1]  # "file:///Users/mparsian/sample.txt"
    print("input_path: {}".format(input_path))

    # get threshold
    threshold = int(sys.argv[2])
Example #40
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Create a spark configuration
conf = SparkConf()

# set master
conf.setMaster('local')
# set app name
conf.setAppName("Some spark")
# spark config
conf.set("spark.cores.max", "1")
# spark config
conf.set("spark.executor.memory", "1g")

# Create spark context
sc = SparkContext(conf=conf)

# Create a labeled point with a positive label and a dense feature vector.
pos = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# Create a labeled point with a negative label and a sparse feature vector.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

print(neg)
                                      outputCol="features")

    pipeline = Pipeline(stages=[tokenizer, countVectorizer])
    model = pipeline.fit(df)
    documents = model.transform(df).select("features").rdd.map(
        lambda x: x.features).zipWithIndex().map(lambda x: [x[1], x[0]])
    return documents, model.stages[1].vocabulary


if __name__ == '__main__':
    start_time1 = time.time()
    args = sys.argv
    sconf = SparkConf()
    sconf.setAppName("lda")
    sconf.setMaster(args[1])
    sconf.set("spark.executor.memory", "6g")
    sconf.set("spark.driver.memory", "6g")
    sconf.set("spark.driver.maxResultSize", "6g")
    sconf.set("spark.yarn.executor.memoryOverhead", "2g")
    sconf.set("spark.yarn.driver.memoryOverhead", "2g")

    sconf.set("spark.eventLog.enabled", "true")
    sconf.set("spark.eventLog.dir",
              "hdfs://" + args[3] + "/user/" + args[4] + "/Logs/")
    sc = SparkContext(conf=sconf)
    dataset = "hdfs://" + args[3] + "/user/" + args[4] + "/In/" + args[2]
    corpus, vocabArray = preprocess(sc,
                                    path=dataset,
                                    vocabsize=50000,
                                    stopwordfile='')
Example #42
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: [email protected]


from pyspark import SparkConf, SparkContext
import os
os.environ['PYSPARK_PYTHON'] = '/export/conda3/envs/pyspark-exp/bin/python3'

if __name__ == "__main__":
    ## Configure Spark parameters
    conf = SparkConf()

    conf.set('spark.master', "local[*]") # yarn
    conf.set('spark.app.name', "word_count")

    ## Create the SparkContext object (sc)
    sc = SparkContext(conf=conf)

    # hello spark
    # hello hadoop
    # hello flink
    file_rdd = sc.textFile("hdfs:///data/word.txt")

    ## Flatten to get the collection of words: ['hello', 'spark', 'hello', 'hadoop', 'hello', 'flink']
    words_rdd = file_rdd.flatMap(lambda line: line.split(" "))
    print(words_rdd.collect())

    ## Map each word to a (word, 1) pair: [('hello', 1), ('spark', 1), ('hello', 1), ('hadoop', 1), ('hello', 1), ('flink', 1)]
    map_rdd = words_rdd.map(lambda x: (x, 1))
    print(map_rdd.collect())
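    ## Continuation sketch (not part of the original snippet): the usual reduceByKey
    ## step that completes the word count
    result_rdd = map_rdd.reduceByKey(lambda a, b: a + b)
    print(result_rdd.collect())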
Example #43
from pyspark import SparkConf
from pyspark.sql import SparkSession
from operator import add
import sys
from pyspark import SparkContext

conf = SparkConf()
conf.setAppName('Assignment3')
conf.set('spark.executor.memory', '2g')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)


def function1(param):
    a = param[0]
    b = param[1]
    w = a[1] * b[1]
    return (a[0], b[0], w)


def function2(param):
    list = []
    for i in range(len(param)):
        for j in range(i + 1, len(param)):
            bc = (param[i], param[j])
            list.append(bc)

    return list


df1 = spark.read.format("com.databricks.spark.avro").load(
Example #44
# author(learning): Scc_hy
# original url: https://github.com/mahmoudparsian/pyspark-algorithms/blob/master/code/chap03/word_count.py
# create date: 2019-12-19
# function: word_count
# data:

import sys, os
from pyspark import SparkContext, SparkConf


def wordcount(sc: SparkContext, input_path: str) -> None:
    rdd = sc.textFile(input_path)
    word_rdd = rdd.flatMap(lambda l: l.split(' '))
    pair_rdd = word_rdd.map(lambda word: (word, 1))
    print(pair_rdd.reduceByKey(lambda a, b: a + b).collect())


if __name__ == '__main__':
    spk_conf = SparkConf()
    spk_conf.setAppName('WordCount').set('spark.executor.memory', '500M')
    spk_conf.set('spark.cores.max', 4)
    try:
        sc = SparkContext(conf=spk_conf)
        input_path = r'D:\My_Learn\pyspark\chap1\sample.txt'
    except:
        print("Failed to connect!")
        print(sys.exc_info()[0])

    # Execute word count
    wordcount(sc, input_path)
#imported the required packages
from pyspark import SparkContext, SparkConf
import numpy
from scipy import spatial

conf = SparkConf()
conf.setAppName('MovieRecommender')
conf.set("spark.executor.memory", "4g")
conf.set("spark.driver.memory", "4g")
conf.set("spark.eventLog.enabled", "true")
conf.set("spark.serializer ", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)

#created rdd on movies and ratings dataset
movies_data = sc.textFile("/home/hadoop/movies.csv").map(
    lambda x: x.split(",")).map(lambda x: (x[0], x[1]))
ratings_data = sc.textFile("/home/hadoop/ratings.csv").map(
    lambda x: x.split(",")).map(lambda x: (x[1], (x[0], x[2])))

#got the userid,(movie title,rating) from two datasets
combineddata = movies_data.join(ratings_data)
requireddata = combineddata.map(lambda x: (x[1][1][0],
                                           (x[1][0], x[1][1][1]))).cache()

#used self join on the rdd
joinedRatings = requireddata.join(requireddata)


#to remove duplicates
def removeDuplicates(ratingvalues):
    ratings = ratingvalues[1]
Example #46
    max_request_threads = webconfig.getint("global",
                                           "server.max_simultaneous_requests")
    log.info("Initializing request ThreadPool to %s" % max_request_threads)
    request_thread_pool = ThreadPool(processes=max_request_threads)

    spark_context = None
    for clazzWrapper in NexusHandler.AVAILABLE_HANDLERS:
        if issubclass(clazzWrapper.clazz(), NexusHandler.SparkHandler):
            if spark_context is None:
                from pyspark import SparkContext, SparkConf

                # Configure Spark
                sp_conf = SparkConf()
                sp_conf.setAppName("nexus-analysis")
                sp_conf.set("spark.scheduler.mode", "FAIR")
                sp_conf.set("spark.executor.memory", "6g")
                spark_context = SparkContext(conf=sp_conf)

            handlers.append((clazzWrapper.path(), ModularNexusHandlerWrapper,
                             dict(clazz=clazzWrapper,
                                  algorithm_config=algorithm_config,
                                  sc=spark_context,
                                  thread_pool=request_thread_pool)))
        else:
            handlers.append((clazzWrapper.path(), ModularNexusHandlerWrapper,
                             dict(clazz=clazzWrapper,
                                  algorithm_config=algorithm_config,
                                  thread_pool=request_thread_pool)))

    class VersionHandler(tornado.web.RequestHandler):
Example #47
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time
import os, sys
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

reload(sys)
sys.setdefaultencoding('utf-8')

conf = SparkConf().set('spark.driver.maxResultSize', '30g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.executor.memory', '15g')
conf.set('spark.executor.instances', 30)
conf.set('spark.executor.cores', 8)
conf.set('spark.executor.extraJavaOptions',
         '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')

spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()

from jg_info import vertex_table_info, edge_info

path_prefix = '/phoebus/_fileservice/users/slmp/shulianmingpin/midfile/open_phone'
Example #48
import sys

from bisect import *
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import StorageLevel

reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == "__main__":
    # $example on:init_session$
    
    conf = SparkConf().setAppName("CreditCardInfo")
    conf.set('spark.cores.max',60)
    conf.set('spark.executor.memory','5g')
    conf.set('spark.rpc.askTimeout',240)
    conf.set('spark.driver.memory','20g')
    conf.set('spark.dynamicAllocation.enabled',True)
    conf.set('spark.shuffle.service.enabled',True)
    conf.set('spark.task.maxFailures',1)
    conf.set('spark.network.timeout','600s')
    conf.set("yarn.nodemanager.vmem-check-enabled","false")
    sc = SparkContext(conf=conf)
    
    sqlContext = SQLContext(sc)

    card_table = sqlContext.read.format("jdbc").option("url","jdbc:mysql://localhost/card_db").option("driver","com.mysql.jdbc.Driver").option("dbtable","card_info").option("user","root").option("password",**********).load()
    
    card_table.createOrReplaceTempView("card_table")
Example #49
    config["stage1File"] = filepath
    with open("config_{0}.json".format(index), "w") as outfile:
        json.dump(config, outfile, indent=4)
    subprocess.call(["./mdm", "config_{0}.json".format(index)])
    subprocess.call([
        "mv", "Final_MDM_{0}.root".format(index),
        "/hdfs/user/hjayatissa/geant_mdm_csi/stage2"
    ])
    subprocess.call([
        "/usr/local/hadoop/bin/hdfs", "dfs", "-chown", "hjayatissa",
        "/user/hjayatissa/geant_mdm_csi/stage2/Final_MDM_{0}.root".format(
            index)
    ])


if __name__ == "__main__":
    sconf = SparkConf().setAppName("mdm-CsI-2")
    sconf.set("spark.executor.memory", "13g")
    sconf.set("spark.python.worker.reuse", "false")
    sc = SparkContext(conf=sconf)

    sc.addFile("mdm")
    sc.addFile("config/config_isobutane_22Ne_6Li_geant_oxford.json")
    sc.addFile("run_oxf.mac")

    file_name = "hdfs://gr-gmaster.tamu.edu:9000//user/hjayatissa/geant_mdm_csi/stage1/MDM_*.root"
    lines = sc.newAPIHadoopFile(file_name, "edu.tamu.hadoop.RootInputFormat",
                                "org.apache.hadoop.io.IntWritable",
                                "org.apache.hadoop.io.Text")
    lines.foreach(lambda x: stage2(x))
Example #50
optimizer = 'adagrad'
loss = 'categorical_crossentropy'

addition = 0
master_port = 5000
send_port = 8000
master_port += addition
send_port += addition
print(master_port)
print(send_port)

chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabdefghijlmnqrtuwxy"
width, height, n_len, n_class = 140, 44, 6, len(chars) + 1

conf = SparkConf()
conf.set("spark.app.name", application_name)
conf.set("spark.master", master)
conf.set("spark.submit.deployMode", deploymode)
conf.set("spark.executor.cores", ` num_cores `)
conf.set("spark.executor.instances", ` num_executors `)
conf.set("spark.sql.warehouse.dir", "hdfs://master:9000/user/hive/warehouse")

###############################################################################
#from pyspark.sql import SparkSession
#sc = SparkSession.builder.master(master).appName(application_name).enableHiveSupport().getOrCreate()
#sqlContext = SQLContext(sc)
################################################################################
sc = SparkContext(conf=conf)
################################################################################

# Define the CTC model and build the trainer
Example #51
#!/usr/bin/env python
# encoding: utf-8
'''
@author: fanyuexiang
@software: pycharm
@file: StreamingInit.py
@time: 2020/2/23 2:41 PM
@desc: initialize Spark Streaming with Python and implement a wordCount
'''
from pyspark.streaming import StreamingContext
from pyspark import SparkConf, SparkContext
import os

PYSPARK_PYTHON ="/Library/Frameworks/Python.framework/Versions/3.6/bin/python3"
os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

conf = SparkConf()
conf.set("spark.app.name", "init-streaming")
conf.set("spark.master", "local[2]")
sc = SparkContext(conf=conf)
streamingSc = StreamingContext(sparkContext=sc, batchDuration=1)
lines = streamingSc.socketTextStream(hostname="localhost", port=7777)
words = lines.flatMap(lambda line: line.split(" "))
pairs = words.map(lambda word: (word, 1))
count = pairs.reduceByKey(lambda x, y: x+y)
count.pprint()
streamingSc.start()
streamingSc.awaitTermination()

Example #52
from pyspark import SparkConf
from pyspark.sql.types import *

import sys
import time
import signal

import itertools
import cassandra

from cassandra.cluster import Cluster
from cassandra.query import named_tuple_factory
from flight import Flight
from itertools import islice, chain

config = SparkConf()
config.set("spark.streaming.stopGracefullyOnShutdown", "true")

filtered = None
ssc = None


def grouper_it(n, iterable):
    it = iter(iterable)
    while True:
        chunk_it = itertools.islice(it, n)
        try:
            first_el = next(chunk_it)
        except StopIteration:
            return
        yield itertools.chain((first_el, ), chunk_it)
Example #53
def spark():

    try:
        import pyspark
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession
        from pyspark.sql import types

        conf = SparkConf()

        conf.set("spark.sql.shuffle.partitions", "1")
        conf.set("spark.jars.ivy", "/home/jovyan/.ivy2/")
        conf.set("spark.driver.extraClassPath",
                 "jars/scala-udf-similarity-0.0.6.jar")
        conf.set("spark.jars", "jars/scala-udf-similarity-0.0.6.jar")
        conf.set("spark.driver.memory", "4g")
        conf.set("spark.sql.shuffle.partitions", "24")

        sc = SparkContext.getOrCreate(conf=conf)

        spark = SparkSession(sc)

        udfs = [
            ("jaro_winkler_sim", "JaroWinklerSimilarity", types.DoubleType()),
            ("jaccard_sim", "JaccardSimilarity", types.DoubleType()),
            ("cosine_distance", "CosineDistance", types.DoubleType()),
            ("Dmetaphone", "DoubleMetaphone", types.StringType()),
            ("QgramTokeniser", "QgramTokeniser", types.StringType()),
            ("Q3gramTokeniser", "Q3gramTokeniser", types.StringType()),
            ("Q4gramTokeniser", "Q4gramTokeniser", types.StringType()),
            ("Q5gramTokeniser", "Q5gramTokeniser", types.StringType()),
        ]

        for a, b, c in udfs:
            spark.udf.registerJavaFunction(a, "uk.gov.moj.dash.linkage." + b,
                                           c)
        SPARK_EXISTS = True
    except:
        SPARK_EXISTS = False

    if SPARK_EXISTS:
        print("Spark exists, running spark tests")
        yield spark
    else:
        spark = None
        logger.error("Spark not available")
        print("Spark not available")
        yield spark
def parseFeat(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return values[:-1]


def parseLabel(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return values[-1]


def error(point, model):
    center = model.centers[model.predict(point)]
    return math.sqrt(sum([x**2 for x in (point - center)]))


conf = SparkConf()
conf.set("spark.master", "local")
sc = SparkContext(conf=conf)

data = sc.textFile("practice6_train.csv")
trData = data.map(parseFeat)

data = sc.textFile("practice6_test.csv")
tsData = data.map(parseFeat)
tsLabel = data.map(parseLabel)

kmeans_list = []
for i in range(30):
    kmeans_list.append(KMeans.train(trData, k=10, maxIterations=100, seed=i))

obj_list = []
for i in range(30):
Example #55
    def __init__(self,
                 sc=None,
                 app_name="Hail",
                 master=None,
                 local='local[*]',
                 log=None,
                 quiet=False,
                 append=False,
                 min_block_size=1,
                 branching_factor=50,
                 tmp_dir=None,
                 default_reference="GRCh37",
                 idempotent=False,
                 global_seed=6348563392232659379,
                 optimizer_iterations=None,
                 _backend=None):

        if Env._hc:
            if idempotent:
                return
            else:
                raise FatalError(
                    'Hail has already been initialized, restart session '
                    'or stop Hail to change configuration.')

        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(
                __name__, "hail-all-spark.jar")
            assert os.path.exists(
                hail_jar_path), f'{hail_jar_path} does not exist'
            sys.stderr.write(f'using hail jar at {hail_jar_path}\n')
            conf = SparkConf()
            conf.set('spark.driver.extraClassPath', hail_jar_path)
            conf.set('spark.executor.extraClassPath', hail_jar_path)
            SparkContext._ensure_initialized(conf=conf)
        else:
            SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        # hail package
        self._hail = getattr(self._jvm, 'is').hail

        self._warn_cols_order = True
        self._warn_entries_order = True

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        jsc = sc._jsc.sc() if sc else None

        if _backend is None:
            apiserver_url = os.environ.get('HAIL_APISERVER_URL')
            if apiserver_url is not None:
                _backend = ServiceBackend(apiserver_url)
            else:
                _backend = SparkBackend()
        self._backend = _backend

        tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')
        optimizer_iterations = get_env_or_default(optimizer_iterations,
                                                  'HAIL_OPTIMIZER_ITERATIONS',
                                                  3)

        version = read_version_info()
        hail.__version__ = version

        if log is None:
            log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                            suffix=f'-{version}.log')
        self._log = log

        # we always pass 'quiet' to the JVM because stderr output needs
        # to be routed through Python separately.
        # if idempotent:
        if idempotent:
            self._jhc = self._hail.HailContext.getOrCreate(
                jsc, app_name, joption(master), local, log, True, append,
                min_block_size, branching_factor, tmp_dir,
                optimizer_iterations)
        else:
            self._jhc = self._hail.HailContext.apply(jsc, app_name,
                                                     joption(master), local,
                                                     log, True, append,
                                                     min_block_size,
                                                     branching_factor, tmp_dir,
                                                     optimizer_iterations)

        self._jsc = self._jhc.sc()
        self.sc = sc if sc else SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        self._jsql_context = self._jhc.sqlContext()
        self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

        super(HailContext, self).__init__()

        # do this at the end in case something errors, so we don't raise the above error without a real HC
        Env._hc = self

        ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True)
        ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True)
        ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True)

        if default_reference in ReferenceGenome._references:
            self._default_ref = ReferenceGenome._references[default_reference]
        else:
            self._default_ref = ReferenceGenome.read(default_reference)

        jar_version = self._jhc.version()

        if jar_version != version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {version}")

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger('localhost', 12888)

            self._hail.HailContext.startProgressBar(self._jsc)

            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

            if version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        install_exception_handler()
        Env.set_seed(global_seed)
Example #56
local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")
sys.path.append(local_path)

from pyspark import SQLContext, SparkConf, HiveContext
from pyspark import SparkContext

from ml import diff_feature_cls, diff_train_cls


def run(sc, sql_context, is_hive):
    diff_feature_cls.main(sc, sql_context, is_hive=True)
    diff_train_cls.main(sc, sql_context, is_hive=True)


if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "32g")

    sc = SparkContext(appName="bintrade_candidate",
                      master="yarn-client",
                      conf=conf)
    sqlContext = HiveContext(sc)
    sqlContext.setConf("spark.sql.shuffle.partitions", "32")

    sqlContext.sql("use fex")

    run(sc, sqlContext, is_hive=True)
Example #57
from pyspark import SparkContext, SparkConf
import numpy as np
conf = SparkConf()
conf.set('master', 'spark://hadoop-maste:7077')
context = SparkContext(conf=conf)
acc = context.accumulator(0)
print(type(acc), acc.value)
rdd = context.parallelize(np.arange(101), 5)


def acc_add(a):
    acc.add(a)
    return a


rdd2 = rdd.map(acc_add)
print(rdd2.collect())
print(acc.value)
context.stop()
Example #58
 def getConf(self):
     conf = SparkConf()
     for k, v in DEFAULT_SPARK_CONFIG.items():
         conf.set(k, v)
     return conf
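DEFAULT_SPARK_CONFIG is not included in this snippet; a hypothetical shape it could take, given how getConf iterates over it:

DEFAULT_SPARK_CONFIG = {
    "spark.app.name": "default-app",
    "spark.master": "local[*]",
    "spark.executor.memory": "2g",
}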
Example #59
def papers_citations(sc):
	#papers and number of citations per year
	citations = sc.textFile("/corpora/corpus-microsoft-academic-graph/data/PaperReferences.tsv.bz2")
	citations = citations.map(lambda line : line.split("\t")).map(lambda c: (c[0], c[1]))

	papers = papers_newer_than(sc, 2013)
	papers = papers.map(lambda p: (p[0], p[3]))


	#join
	papersMap = sc.broadcast(papers.collectAsMap());

	rowFunc1 = lambda x: (x[0], x[1], papersMap.value.get(x[1], -1))
	def mapFunc1(partition):
		for row in partition:
			yield rowFunc1(row)


	result = citations.mapPartitions(mapFunc1, preservesPartitioning=True)
	result = result.filter(lambda c: c[2] != -1).map(lambda x: (x[0], x[1]))
	result.saveAsHadoopFile('/user/bd-ss16-g3/data/citations_2014', "org.apache.hadoop.mapred.TextOutputFormat", compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")

if __name__ == "__main__":
	# Configure OPTIONS
	conf = SparkConf().setAppName(APP_NAME)
	conf = conf.setMaster("yarn-client")
	conf = conf.set("spark.executor.memory", "25g").set("spark.driver.memory", "25g").set("spark.mesos.executor.memoryOverhead", "10000")
	sc   = SparkContext(conf=conf)
	#papers_citations(sc)
	papers_with_citations(sc)
        save_results(toSave)


def calculateAverage(newVal, accumulativeAvg):
    if accumulativeAvg is None:
        accumulativeAvg = (0.0, 0, 0.0)
    total = sum(newVal, accumulativeAvg[0])
    count = accumulativeAvg[1] + len(newVal)
    avg = total / float(count)
    return (total, count, avg)


if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("Problem_2-1")
    conf.set("spark.streaming.kafka.maxRatePerPartition", 50000)
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.python.worker.memory", "1g")

    airports = ['CMI', 'BWI', 'MIA', 'LAX', 'IAH', 'SFO']

    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("/tmp/streaming")

    brokers = "b-1.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092,b-2.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092"
    topic = "cs598-task2 "
    kafka_consumer_group = str(uuid.uuid4())
    kafka_client_params = {
        "metadata.broker.list": brokers,