class SparkContextFactory:
  def __init__(self):
    # On Windows the environment variables could not be read, so they were set here instead:
    # os.environ["SPARK_HOME"] = "C:\\Spark"
    # os.environ["HADOOP_CONF_DIR"] = "C:\\hdp\\bin"
    # sys.path.append("C:\\Spark\\python")
    # sys.path.append("C:\\Spark\\bin")

    # specify spark home
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
    # specify pyspark path so its libraries can be accessed by this application
    sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext

    self.conf = SparkConf().setMaster("yarn-client")
    self.conf.setAppName("MrT")
    self.conf.set("spark.executor.memory", "5g")
    self.conf.set("spark.driver.memory", "10g")

    self.sc = SparkContext(conf = self.conf, pyFiles =
    ["ComputeCovHistory.py", "go.py", "risk_DSconvert.py", "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

    """
    toDF method is a monkey patch executed inside SQLContext constructor
    so to be able to use it you have to create a SQLContext first
    """
    self.sqlContextInstance = SQLContext(self.sc)


  def disconnect(self):
    self.sc.stop()
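# Hedged usage sketch (illustrative data): the toDF call below only works
# because SQLContext(self.sc) above has already monkey-patched toDF onto RDD.
factory = SparkContextFactory()
rdd = factory.sc.parallelize([("AAPL", 0.3), ("MSFT", 0.7)])
df = rdd.toDF(["symbol", "weight"])
df.show()
factory.disconnect()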
Example #2
def main():
    """
    Main entry point of the application
    """

    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc, file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
Example #3
def stackexchange_xml_spark_job():
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf()

    xml_file_address = "hdfs://" + server + "/" +\
                       bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME +\
                       bluebook_conf.STACKEXCHANGE_XML_FILE_NAME
                         
    json_ques_folder_address = "hdfs://" + server + "/" +\
                               bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME
    json_ans_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME
        
    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)
        
    file = spark_context.textFile(xml_file_address)

    # Ques and Ans files are stored separately depending on their 'posttypeid'
    # Ques -> posttypeid == 1
    # Ans -> posttypeid == 2
    ques = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '1')\
               .map(lambda d: jsoner(d))
    ans = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '2')\
               .map(lambda d: jsoner(d))
    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
Example #4
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQLContext to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example #5
    def __connected_yarn_spark_cluster(self, pilotcompute_description):

        number_cores=1
        if pilotcompute_description.has_key("number_cores"):
            number_cores=int(pilotcompute_description["number_cores"])
        
        number_of_processes = 1
        if pilotcompute_description.has_key("number_of_processes"):
            number_of_processes = int(pilotcompute_description["number_of_processes"])

        executor_memory="1g"
        if pilotcompute_description.has_key("physical_memory_per_process"):
            executor_memory = pilotcompute_description["physical_memory_per_process"]

        conf = SparkConf()
        conf.set("spark.num.executors", str(number_of_processes))
        conf.set("spark.executor.instances", str(number_of_processes))
        conf.set("spark.executor.memory", executor_memory)
        conf.set("spark.executor.cores", number_cores)
        if pilotcompute_description!=None:
            for i in pilotcompute_description.keys():
                if i.startswith("spark"):
                    conf.set(i, pilotcompute_description[i])
        conf.setAppName("Pilot-Spark")
        conf.setMaster("yarn-client")
        sc = SparkContext(conf=conf)
        sqlCtx = SQLContext(sc)
        pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
        return pilot
def configureSpark():
	conf = SparkConf()
	conf.setMaster("local")
	conf.setAppName("Apache Spark Alarm Parser")
	conf.set("spark.executor.memory", "1g")
	sc = SparkContext(conf = conf)
	return sc
Example #7
def sparkconfig():
    # spark configuration options

    # conf = SparkConf()
    # conf.setMaster("spark://3.168.100.58:7077") # uncomment for standalone cluster
    # conf.setMaster("local")   # uncomment for local execution
    # conf.setAppName("demo_chain")
    # conf.set("spark.executor.memory", "2g")
    # conf.set("spark.default.parallelism", 56)  # 48)
    # conf.set("spark.sql.inMemoryColumnarStorage.compressed","true")
    # conf.set("sql.inMemoryColumnarStorage.batchSize",2000)

    # AMAZON AWS EMR
    conf = SparkConf()
    conf.setMaster("yarn-client")	#client gets output to terminals
    #conf.setMaster("yarn-cluster")   # this seems to run faster but can't confirm
    conf.set("spark.default.parallelism",648)
    conf.setAppName("spark_markov_chain")
    conf.set("spark.executor.memory", "22g")
    conf.set("spark.executor.instances",9)
    conf.set("spark.executor.cores",9)
    conf.set("spark.yarn.executor.memoryOverhead",800)
    conf.set("spark.rdd.compress","True")
    conf.set("spark.shuffle.consolidateFiles","True")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    return conf
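# Hedged usage sketch: hand the EMR-tuned conf above to a SparkContext.
from pyspark import SparkContext

sc = SparkContext(conf=sparkconfig())
print(sc.getConf().get("spark.executor.memory"))   # expected: "22g"
sc.stop()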
Example #8
def main(args):

    if len(args) < 2:
        sys.exit(1)

    # Setting the cluster configuration parameters
    spark_master = args[0]
    spark_data_file_name = args[1]
    file_path = CURR_DIR + "/" + spark_data_file_name

    conf = SparkConf()
    conf.setMaster(spark_master)
    conf.setAppName("Log Scanner")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    txt_logs = sc.textFile(file_path).filter(lambda line: check(line))
    access_logs = txt_logs.map(lambda line: AccessLog(line))

    #  Getting response_codes from log objects and caching it
    response_codes = access_logs.map(lambda log: log.get_status()).cache()
    log_count = response_codes.count()
    print("Total Resonse Codes: " + str(log_count))
    cnt = response_codes.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    response200 = cnt.filter(lambda x: x[0] == "200").map(lambda kv: kv[1]).collect()
    print("###########################")
    print("##  Success Rate : " + str(int(response200[0])*100/log_count) + " %  ##")
    print("###########################")
Example #9
    def spark_config(self):
        if self._spark_config is None:
            os.environ['SPARK_SUBMIT_CLASSPATH'] = ','.join(self.spex_conf.spark_config.jars)

            conf = SparkConf()
            conf.setAppName(self.spex_conf.spark_config.name)
            conf.setMaster(self.spex_conf.spark_config.master)

            conf.set('spark.rdd.compress', 'true')
            conf.set('spark.io.compression.codec', 'lz4')
            conf.set('spark.mesos.coarse',
                     'true' if self.spex_conf.spark_config.coarse_mode else 'false')

            # TODO - Setup all the other cruft as needed
            #conf.set('spark.executor.memory', '4g')
            #conf.set('spark.cores.max', '16')
            #conf.set('spark.task.cpus', '6')

            # TODO - bind port for spark web ui

            self._spark_config = conf

        config = self._spark_config

        # These are always set; if someone changes them, we simply set them back.
        config.set('spark.executor.uri', self.artifact_resolver(self.spex_conf.spark_distro))
        config.setExecutorEnv(key='PYSPARK_PYTHON', value='./%s daemon' % self.spex_conf.spex_name)
        return config
def read_conf():
    """
    Setting up spark contexts
    """
    conf = SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("Testing")
    return conf
Example #11
 def getSparkConf(self):
     conf = SparkConf()
     conf.setAppName(self.PROJECT_NAME)
     conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
     conf.set("spark.cleaner.ttl", self.TTL)
     # es
     conf.set("es.index.auto.create", "true")
     conf.set("es.nodes", self.ES_NODES)
     return conf
Example #12
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['app.py', 'contourGenerator.py','EventParallelize.py'])
 
    return sc
def get_sc():
    """ Defines and returns a SparkContext from some configurations via SparkConf. """
    conf = SparkConf()
    conf.setAppName("Jon's PySpark")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryroserializer.buffer.mb", "256")
    conf.set("spark.akka.frameSize", "500")
    conf.set("spark.akka.askTimeout", "30")
    
    return SparkContext(conf=conf)
Example #14
 def init(self):
     os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
     # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
     # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
     conf = SparkConf()
     conf.setMaster("local[10]")
     conf.setAppName("PySparkShell")
     conf.set("spark.executor.memory", "2g")
     conf.set("spark.driver.memory", "1g")
     self.sc = SparkContext(conf=conf)
     self.sqlContext = SQLContext(self.sc)        
Example #15
    def __init__(self, master, name):
        self.name=name
        self.master=master

        print "init spark ..."
        os.environ["HADOOP_HOME"]="D:\code\wqr\hadoop-common-2.2.0-bin"
        conf = SparkConf()
        conf.setMaster(self.master)
        conf.setAppName(self.name)

        self.sc = SparkContext(conf=conf)
Example #16
 def __connected_spark_cluster(self, resource_url, pilot_description=None):
     conf = SparkConf()
     conf.setAppName("Pilot-Spark")
     if pilot_description!=None:
         for i in pilot_description.keys():
             if i.startswith("spark"):
                 conf.set(i, pilot_description[i])
     conf.setMaster(resource_url)
     print(conf.toDebugString())
     sc = SparkContext(conf=conf)
     sqlCtx = SQLContext(sc)
     pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
     return pilot
def main():
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set("spark.dynamicAllocation.enabled", "true")
    sc = SparkContext(conf = conf)
    ssc = StreamingContext(sc, 1) # Stream every 1 second
    ssc.checkpoint("checkpoint")

    # Clear the cassandra table
    init_cassandra().execute('TRUNCATE {}'.format(top_airports_table))

    stream_kafka(ssc)
Example #18
def create_spark_instance(master = "local", conf = None):
	"""
	master: default "local"
	conf: default 28 cores with 2g memory
	"""
	if not conf:
		conf = SparkConf()
		conf.set("spark.executor.memory", "2g")
		conf.set("spark.cores.max", "28")
		conf.setAppName("spark ipython notebook")

	spark_context = SparkContext(master, conf = conf)
	return spark_context
    def _init_spark(self, appname):
        """Internal function to setup spark context
        
        Note: only include spark modules here so that
        the interface can be queried outside of pyspark.

        """
        # currently using LZ4 compression: should not degrade runtime much
        # but will help with some operations like shuffling, especially when
        # dealing with objects like highly compressible label volumes
        # NOTE: objects > INT_MAX will cause problems for LZ4
        worker_env = {}
        if "DVIDSPARK_WORKFLOW_TMPDIR" in os.environ and os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]:
            worker_env["DVIDSPARK_WORKFLOW_TMPDIR"] = os.environ["DVIDSPARK_WORKFLOW_TMPDIR"]
        
        try:
            spark_config = self.config_data["options"]["spark-config"]
        except KeyError:
            # Old workflows haven't been updated to inherit the base Workflow schema
            spark_config = {}
        
        for k in list(spark_config.keys()):
            spark_config[k] = str(spark_config[k])
            if spark_config[k] in ('True', 'False'):
                spark_config[k] = spark_config[k].lower()
            
        # Backwards compatibility:
        # if 'corespertask' option exists, override it in the spark config
        if "corespertask" in self.config_data["options"] and self.config_data["options"]["corespertask"] != 0:
            if "spark.task.cpus" in spark_config and spark_config["spark.task.cpus"] != '1':
                raise RuntimeError("Bad config: You can't set both 'corespertask' and 'spark.task.cpus'.  Use 'spark.task.cpus'.")
            spark_config["spark.task.cpus"] = str(self.config_data["options"]["corespertask"])

        # set spark config
        from pyspark import SparkContext, SparkConf
        conf = SparkConf()
        conf.setAppName(appname)
        conf.setAll(list(spark_config.items()))
        
#         from pyspark_flame import FlameProfiler
#         flamegraph_dir = f'{self.config_dir}/flamegraphs'
#         os.makedirs(flamegraph_dir, exist_ok=True)
#         conf.set("spark.python.profile.dump", flamegraph_dir)
#         conf.set("spark.python.profile", "true")
#         worker_env['pyspark_flame.interval'] = 0.25 # Default is 0.2 seconds
#         return SparkContext(conf=conf, batchSize=1, environment=worker_env, profiler_cls=FlameProfiler)

        # Auto-batching heuristic doesn't work well with our auto-compressed numpy array pickling scheme.
        # Therefore, disable batching with batchSize=1
        return SparkContext(conf=conf, batchSize=1, environment=worker_env)
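# Hedged sketch of the workflow config shape this method reads (keys taken from
# the lookups above; values are purely illustrative).
config_data = {
    "options": {
        "corespertask": 0,                  # 0 defers to spark.task.cpus
        "spark-config": {
            "spark.executor.memory": "8g",
            "spark.rdd.compress": True,     # stringified and lowercased to 'true' above
        },
    },
}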
Example #20
class FireStarter():

  mappings = {
    'http_api': readers.HttpApi,
    'lighter': igniters.Lighter,
    'hdfs': writers.HadoopFileSystem
  }

  def __init__(self, config_file):
    self.config_file = config_file

  def read_config_file(self):
    with open(self.config_file, 'r+') as config_data:
      self.config_data = config_data.read()

  def parse_config_contents(self):
    self.config = json.loads(self.config_data)
    check_requirements = required_config - frozenset(self.config.keys())
    if check_requirements:
      raise ValueError('%s must contain %s' % (self.config_file, ', '.join(check_requirements)))

  def load_modules(self):
    """This loop initializes all of the readers,
    writers, and igniters then stores them in an array"""
    self.modules = OrderedDict()
    self.data = OrderedDict()

    for module in self.config['modules']:
      # Access the module via name, or by order
      new_module = self.modules[module['name']] = self.mappings[module['type']](**module['parameters'])
      self.data[module['name']] = new_module.data

  def create_spark_context(self):
    conf = self.config['spark_conf']
    self.spark_config = SparkConf()
    self.spark_config.setAppName(conf['app_name'])
    for attribute, value in conf['parameters'].items():
        self.spark_config.set(attribute, value)

    self.sc = SparkContext(conf = self.spark_config)

  def run_modules(self):
    for name, module in self.modules.items():
      module.execute()

  def execute(self):
    self.read_config_file()
    self.parse_config_contents()
    self.load_modules()
    self.run_modules()
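# Hedged sketch of a config file FireStarter could consume (structure inferred
# from the attribute access above; the module parameters are illustrative and
# required_config is defined elsewhere, so the exact required keys are assumed).
example_config = """
{
  "modules": [
    {"name": "source", "type": "http_api", "parameters": {"url": "http://example.com/feed"}},
    {"name": "sink",   "type": "hdfs",     "parameters": {"path": "/data/out"}}
  ],
  "spark_conf": {
    "app_name": "firestarter-job",
    "parameters": {"spark.executor.memory": "2g"}
  }
}
"""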
def main():
    conf = SparkConf()
    conf.setMaster('local[*]')
    conf.setAppName('spark-basic')
    sc = SparkContext(conf=conf)
    churn_df = read_dataset(sc, "churn_no_header.csv")
    pipeline = build_pipeline()
    training_data, test_data = train_test_split(churn_df, 0.2)
    model = pipeline.fit(training_data)
    predictions = model.transform(test_data)
    print predictions.show(20)

    (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR'])
    print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
Example #22
def set_up_spark():

	######################
	#
	# initialize spark
	#
	######################

	conf = SparkConf()
	conf.setAppName("Spark Test")
	conf.set('spark.shuffle.io.preferDirectBufs','false')
	sc = SparkContext(conf = conf)
	quiet_logs(sc)
	sqlContext = SQLContext(sc)
	return sqlContext,sc
def createSparkCtx (Config,name):
    logging.info ("Getting Spark Context")
    spark_location =  Config.get("connectioninfo","spark_location")
    spark_version =  Config.get("connectioninfo","spark_version")
    spark_executor_num =  Config.get("connectioninfo","spark_executor_num")
    spark_executor_mem =  Config.get("connectioninfo","spark_executor_mem")
    spark_executor_cores =  Config.get("connectioninfo","spark_executor_cores")
    spark_driver_mem =  Config.get("connectioninfo","spark_driver_mem")
    spark_auto_broadcast =  Config.get("connectioninfo","spark_auto_broadcast")

    # Depending on Spark version chose a different version of Spark on the system
    # Location is coming from the config file
    os.environ['SPARK_HOME'] = spark_location
    os.environ['PYTHONPATH'] = spark_location + "/python:"+ spark_location + "/python/lib/usr/lib/spark:$PYTHONPATH"
    os.environ['HADOOP_CONF_DIR'] = "/etc/hadoop/conf"
    os.environ['YARN_CONF_DIR'] = "/etc/hadoop/conf"
    sys.path.append(spark_location + "/python")

    from pyspark.sql import HiveContext
    from pyspark import SparkContext, SparkConf

    try:
      if spark_version == "2.0":
            from pyspark.sql import SparkSession
            sqlContext = SparkSession.builder.master("yarn").appName("Spark2 SQL Driver")\
                .config("spark.executor.instances", spark_executor_num)\
                .config("spark.executor.memory", spark_executor_mem)\
                .config("spark.driver.memory", spark_driver_mem)\
                .config("spark.executor.cores", spark_executor_cores)\
                .config("spark.sql.autoBroadcastJoinThreshold", spark_auto_broadcast)\
                .config("spark.yarn.queue", name)\
                .enableHiveSupport().getOrCreate ()
            return sqlContext
      else:
        conf = SparkConf()
        conf.setAppName("Spark1 SQL Driver")
        conf.set("spark.executor.instances", spark_executor_num )
        conf.set("spark.executor.memory", spark_executor_mem)
        conf.set("spark.executor.cores", spark_executor_cores)
        conf.set("spark.driver.memory", spark_driver_mem)
        conf.set("spark.yarn.queue", name)
        conf.set("spark.sql.autoBroadcastJoinThreshold", spark_auto_broadcast)
        sc = SparkContext (conf=conf)
        sqlContext = HiveContext(sc)
        return sqlContext
    except Exception, e:
        logging.error ("Exception encountered: " +  str(e))
        sys.exit (1)
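# Hedged usage sketch (Python 2, matching the except-clause syntax above): build
# the "connectioninfo" section the function reads, then request a context.
# All option values are illustrative.
import ConfigParser

Config = ConfigParser.ConfigParser()
Config.add_section("connectioninfo")
for option, value in [("spark_location", "/opt/spark"),
                      ("spark_version", "2.0"),
                      ("spark_executor_num", "4"),
                      ("spark_executor_mem", "4g"),
                      ("spark_executor_cores", "2"),
                      ("spark_driver_mem", "4g"),
                      ("spark_auto_broadcast", "10485760")]:
    Config.set("connectioninfo", option, value)

sqlContext = createSparkCtx(Config, "default")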
def main():
    global ssc

    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)

    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")

    signal.signal(signal.SIGINT, stop_streaming)

    stream_kafka()
Example #25
def sc(cores=None, pyFiles=['back.py', 'cli.py', 'ingest.py', 'spark.py', 'text_nltk.py', 'spark.py', 'word2vec.py'], memo=None):
    if not cores:
        cores = 4
    try:
        return sc.sc
    except AttributeError:
        from pyspark import SparkConf, SparkContext
        print >>sys.stderr, "CORES: %i" % cores
        conf = SparkConf()
        conf.setAppName("Nuance/Q%s" % (" [%s]" % memo if memo else ""))
        conf.set("spark.executor.memory", "8g")
        conf.set("spark.cores.max", str(cores))
        conf.set("master.ui.port", "8082")
        conf.set("spark.ui.port", "4041")  # kicked in

        sc.sc = SparkContext(conf=conf, pyFiles=pyFiles)
        return sc.sc
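# Hedged usage sketch: the context is cached as an attribute on the sc function
# itself, so repeated calls return the same object instead of building a new one.
first = sc(cores=8, memo="ingest run")
second = sc()            # arguments are ignored once the context is cached
assert first is second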
def main():

    # Configure Spark
    conf = SparkConf()
    conf.setAppName("Application name")  # Specify the application name
    conf.set("spark.jars", "file:/shared_data/spark_jars/hadoop-openstack-3.0.0-SNAPSHOT.jar")  # Don't modify
    sc = SparkContext(conf=conf)  # Spark Context variable that will be used for all operations running on the cluster

    parser = argparse.ArgumentParser()
    parser.add_argument("backend", type=str)
    parser.add_argument("helperpath", type=str)
    parser.add_argument("shuffle_partitions", type=str)
    parser.add_argument("params", type=str)
    parser.add_argument("inputs", type=str)
    parser.add_argument("features", type=str, nargs='?')

    args = parser.parse_args()

    # Swift Connection
    if(args.backend == 'swift'):
        hadoopConf = sc._jsc.hadoopConfiguration()
        hadoopConf.set("fs.swift.impl", "org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem")
        hadoopConf.set("fs.swift.service.SparkTest.auth.url", os.environ['OS_AUTH_URL'] + "/tokens")
        hadoopConf.set("fs.swift.service.SparkTest.http.port", "8443")
        hadoopConf.set("fs.swift.service.SparkTest.auth.endpoint.prefix", "/")
        hadoopConf.set("fs.swift.service.SparkTest.region", os.environ['OS_REGION_NAME'])
        hadoopConf.set("fs.swift.service.SparkTest.public", "false")
        hadoopConf.set("fs.swift.service.SparkTest.tenant", os.environ['OS_TENANT_ID'])
        hadoopConf.set("fs.swift.service.SparkTest.username", os.environ['OS_USERNAME'])
        hadoopConf.set("fs.swift.service.SparkTest.password", os.environ['OS_PASSWORD'])

    helperpath = str(args.helperpath)  # This is passed by default
    sc.addFile(helperpath + "/utils/helper.py")  # To import custom modules
    shuffle_partitions = args.shuffle_partitions

    # Create a dict and pass it in your_module_implementation
    params = json.loads(args.params)
    inputs = json.loads(args.inputs)
    features = json.loads(args.features)  # Only used when you want to create a feature set

    sqlContext = SQLContext(sc)  # Create SQLContext var from SparkContext, To work with our default format of datasets i.e. Parquet
    sqlContext.setConf("spark.sql.shuffle.partitions", shuffle_partitions)  # Don't change, required for controlling parallelism

    # Pass the sc (Spark Context) and sqlContext along with the different parameters and inputs.
    module_implementation(sc, sqlContext, params=params, inputs=inputs, features=features)
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print sc_conf.getAll()

    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc
Example #28
def main():
    # Spark Configurations
    conf = SparkConf()
    conf.set("spark.master", "local[*]")
    conf = conf.setAppName('Learning PySpark')
    sc = SparkContext(conf=conf)
    df = sc\
        .textFile('IXQ_20170622080001.csv')\
        .map(lambda line: line.split(','))
    print(df.take(5))
def main():
    conf = SparkConf()
    conf.setMaster('local[*]')
    conf.setAppName('renewer-prediction-spark')
    filename = '/Users/andyyoo/scikit_learn_data/renewer/Orange_Dataset.no.header.csv'
    sc = SparkContext(conf=conf)
    df = read_dataset(sc, filename)
    df = pipe_index_string_cols(df, cols=["label"])
    df = pipe_assemble_features(df, excluded_cols=["label"])
    df = pipe_scale_cols(df, with_mean=True, with_std=True, use_dense_vector=False)
    df.show()

    training_data, test_data = train_test_split(df, 0.2)
    model = rf_classifier().fit(training_data)
    predictions = model.transform(test_data)
    print predictions.show(20)

    (roc_score, pr_score) = evaluate(predictions, ['areaUnderROC', 'areaUnderPR'])
    print "\nSpark AUC Score: ", roc_score, ", PR Score: ", pr_score
def main():
    conf = SparkConf()
    conf.setMaster("spark://192.168.199.123:8070")
    conf.setAppName("User Profile Spark")

    sc = SparkContext(conf=conf)
    print("connection sucessed with Master", conf)
    data = [1, 2, 3, 4]
    distData = sc.parallelize(data)
    print(distData.collect())
    #
    raw = open(TRACKS_PATH, 'r').read().split("\n")
    tackfile = sc.parallelize(raw)

    tackfile = tackfile.filter(lambda line: len(line.split(',')) == 6)
    tbycust = tackfile.map(lambda line: make_tracks_kv(line)).reduceByKey(lambda a, b: a + b)

    custdata = tbycust.mapValues(lambda a: compute_stats_byuser(a))

    print(custdata.first())
Example #31
def geopyspark_conf(master=None, appName=None, additional_jar_dirs=[]):
    """Construct the base SparkConf for use with GeoPySpark.  This configuration
    object may be used as is, or may be adjusted according to the user's needs.

    Note:
        The GEOPYSPARK_JARS_PATH environment variable may contain a colon-separated
        list of directories to search for JAR files to make available via the
        SparkConf.

    Args:
        master (string): The master URL to connect to, such as "local" to run
            locally with one thread, "local[4]" to run locally with 4 cores, or
            "spark://master:7077" to run on a Spark standalone cluster.
        appName (string): The name of the application, as seen in the Spark
            console
        additional_jar_dirs (list, optional): A list of directory locations that
            might contain JAR files needed by the current script.  Already
            includes $(pwd)/jars.

    Returns:
        SparkConf
    """

    conf = SparkConf()

    if not appName:
        raise ValueError("An appName must be provided")
    else:
        conf.setAppName(appName)

    if master:
        conf.setMaster(master)

    if 'GEOPYSPARK_JARS_PATH' in os.environ:
        additional_jar_dirs = additional_jar_dirs + os.environ[
            'GEOPYSPARK_JARS_PATH'].split(':')

    conf.set(key='spark.ui.enabled', value='false')
    conf.set(key='spark.serializer',
             value='org.apache.spark.serializer.KryoSerializer')
    conf.set(key='spark.kryo.registrator',
             value='geopyspark.geotools.kryo.ExpandedKryoRegistrator')

    current_location = os.path.dirname(os.path.realpath(__file__))
    cwd = os.getcwd()

    local_prefixes = [
        os.path.abspath(os.path.join(current_location, 'jars')),
        os.path.abspath(os.path.join(cwd, 'jars')),
        os.path.abspath(os.path.join(cwd, '../geopyspark/jars'))
    ]
    possible_jars = [
        os.path.join(prefix, '*.jar')
        for prefix in local_prefixes + additional_jar_dirs
    ]
    configuration = os.path.join(current_location, 'command',
                                 'geopyspark.conf')

    if not possible_jars:
        if os.path.isfile(configuration):
            with open(os.path.join(configuration)) as config_file:
                possible_jars.append(os.path.relpath(config_file.read(), cwd))

    module_jars = [os.path.abspath(resource_filename('geopyspark.jars', JAR))]

    jar_dirs = [(jar, os.path.dirname(jar)) for jar in module_jars]

    for jar, jar_dir in jar_dirs:
        if jar_dir not in local_prefixes:
            possible_jars.append(jar)

    returned = [glob.glob(jar_files) for jar_files in possible_jars]
    jars = [jar for sublist in returned for jar in sublist]

    if not jars:
        raise IOError(
            "Failed to find any jars. Looked at these paths {}".format(
                possible_jars))

    jar_string = ",".join(set(jars))
    conf.set(key='spark.jars', value=jar_string)
    conf.set(key='spark.driver.memory', value='8G')
    conf.set(key='spark.executor.memory', value='8G')

    return conf
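# Hedged usage sketch (the jar lookup above must succeed for this to run).
from pyspark import SparkContext

conf = geopyspark_conf(master="local[4]", appName="geopyspark-example")
sc = SparkContext(conf=conf)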
Example #32
#!env/bin/python
# -*- coding: utf-8 -*-

from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.setAppName("PYSPARK_JOB_NAME")
sc = SparkContext(conf=conf)
Example #33
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from pyspark import SparkContext
from pyspark import SparkConf

conf = SparkConf()
conf.setAppName('Spark Quick Start Sample')

sc = SparkContext(conf=conf)

f = sc.textFile('/project/public/PGYR15/OP_PGYR2015_README_P01172017.txt')

word_counts = f.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(
    lambda a, b: a + b).sortBy(lambda x: x[1], ascending=False).collect()

fout = open("quick-start-work-count.txt", 'w')

for w in word_counts:
    fout.write("%s: %d\n" % w)

fout.close()
Example #34
from pyspark import SparkContext, SparkConf
import re

conf = SparkConf()
# To run this file with 'spark-submit' set master to 'yarn-cluster'
# conf.setMaster("local")
conf.setAppName("DateTime")
# Creates SparkContext for the Main entry point of Spark functionality
sc = SparkContext(conf=conf)

# Read the input file
# For remote cluster set remote host_name:port instead of localhost:9000
lines = sc.textFile("/Data/NASA_Access_Log")


# Method to split the input line and returns hour value
def getKey(line):
    # Replace '- ' with empty value, so input lines are separated only by white space
    str = line.replace("- ", " ")
    # Replace multiple spaces by single space
    str = ' '.join(str.split())
    # Split the input line by white space unless text enclosed with in double quotes and '[]' and stores the each field as string array
    str = re.findall('\[[^\]]*\]|\"[^\"]*\"|\S+', str)
    # Get the timestamp value and stores the hour value
    str = str[1].replace("[", "").split(":")[1]
    return str


# Creates the Key-Value pair with the hour value as the key and the integer 1 as the value
pairs = lines.map(lambda s: (getKey(s), 1))
# Creates the Key-Value pair with the hour value as the key and the corresponding count as the value
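# The snippet is cut off here; given the preceding comment, a plausible
# continuation (an assumption, not the original code) would be:
counts = pairs.reduceByKey(lambda a, b: a + b)
print(counts.collect())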

def main(spark, bucket, input_file_name):
    """Run ETL pipeline"""

    # GCS bucket name to create and output tables to
    input_data = "gs://" + bucket + "raw/"
    output_data = "gs://" + bucket + "transformed/"
    print(spark)
    # process_song_data(spark, input_data, output_data)
    process_log_data(spark, input_data, output_data, input_file_name)


if __name__ == "__main__":
    spark_conf = SparkConf()
    spark_conf.setAppName('Sparkify etl')
    spark_context = SparkContext(conf=spark_conf)
    sqlContext = SQLContext(spark_context)
    spark = SparkSession.builder.getOrCreate()
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--bucket',
        dest='bucket',
        required=True,
        help='Specify the full GCS wildcard path to the json files to enhance.'
    )

    parser.add_argument(
        '--raw_file_name',
        dest='raw_file_name',
Example #36
import numpy as np
from pyspark import SparkConf, SparkContext

from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("spark://172.18.109.87:7077")
    # conf.setMaster("local")
    conf.setAppName("spark_svm")
    conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)
    X, y = make_classification(n_samples=10000, n_features=30, n_classes=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    samples = sc.parallelize(Bootstrap(y.size))
    feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("feature_map",
                                             feature_map_fourier),
                                            ("svm", SGDClassifier())])
    fourier_approx_svm.set_params(feature_map__n_components=700)
    results = samples.map(lambda (index, _):
                          fourier_approx_svm.fit(X[index], y[index]).score(X_test, y_test)) \
                          .reduce(lambda x,y: x+y)
    final_results = results / len(Bootstrap(y.size))
    data = sock.recv(1024)
    sock.close()

    print("Got data from stream.py")
    print(data)
    print(data.decode("utf-8"))
    return data.decode("utf-8").replace("#", "hashtag-").lower()


hashtagIndex = getHashtagData()
initES(hashtagIndex)

# Pyspark
# create spark configuration
conf = SparkConf()
conf.setAppName('TwitterApp')
conf.setMaster('local[2]')

# create spark context with the above configuration
sc = SparkContext(conf=conf)

# set the log level to one of ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
# sc.sparkContext.setLogLevel("OFF")
sc.setLogLevel("ERROR")

# create the Streaming Context from spark context with interval size 4 seconds
ssc = StreamingContext(sc, 4)
ssc.checkpoint("checkpoint_TwitterApp")

# read data from the socket at TCP_IP:TCP_PORT
dataStream = ssc.socketTextStream(TCP_IP, TCP_PORT)
Example #38
#!/usr/bin/python
from pyspark import SparkContext, SparkConf
import sys,os,math
import json

tableName=sys.argv[1]
xcol=int(sys.argv[2])
ycol=int(sys.argv[3])
gcol=sys.argv[4:]
for i in range(0, len(gcol)):
    gcol[i] = int(gcol[i])

conf = SparkConf()
conf.setAppName("MyWordCount")
conf.setMaster("spark://ubuntu:7077")
sc = SparkContext(conf=conf)

rdd = sc.textFile("hdfs://localhost:9000/Tables/" + tableName)

class T1:
    def __init__(self):
        self.val = 1


def mysplit(line):
    items = line.split("#")
    glist=[]
    for i in range(0, len(gcol)):
        glist.append(items[gcol[i]])

    return (tuple(glist),(items[xcol],items[ycol]))
Example #39
 def initSparkConf(isLocal, appName):
     conf = SparkConf()
     conf.setAppName(appName)
     if isLocal is True:
         conf.setMaster("local")
     return conf
Example #40
    https://colab.research.google.com/drive/1HM0bHJ8wC333y_TUjb8-Ja3V-DD3AFNA
"""

import sys
import re
import math
from collections import Counter
!pip install pyspark
from pyspark import SparkContext, SparkConf
from random import randint
import shutil
# shutil.rmtree('')
# !unzip TF_index.zip
conf = SparkConf()
conf.setMaster('local')
conf.setAppName('TF/IDF')
sc = SparkContext.getOrCreate(conf=conf)

# Read CTF_index
# a. Retrieve cosine normalized vector of the query words from CTF_index
def st_to_dict(line):
  line = line.split("@")
  line1 = line[1].split("+")
  return (line[0], line[1])


# read data
# Edit file name to give your input data
dataFile = 'CTF_index'
# create RDDS, read stopwords file
stopWordFile ='stopwords-en.txt'
Example #41
from pyspark import SparkConf, SparkContext

conf = SparkConf()
conf.setAppName("Cancelled OrderOver1000 Pyspark")

sc = SparkContext(conf=conf)


def parseOrders(rec):
    parts = rec.split(",")
    return (int(parts[0]), parts[1], int(parts[2]), parts[3])


def parseOrderItems(rec):
    parts = rec.split(",")
    return (int(parts[0]), int(parts[1]), int(parts[2]), int(parts[3]),
            float(parts[4]), float(parts[5]))


path = "/user/hive/warehouse/retail_db.db"

orders = sc.textFile(path + "/orders").map(lambda x: parseOrders(x)).map(
    lambda x: (x[0], (x[1], x[2], x[3])))

orderItems = sc.textFile(path + "/order_items").map(
    lambda x: parseOrderItems(x)).map(lambda x: (x[1], x[4]))

# always filter as early as possible before doing joins, sort or aggregates
ordersCancelled = orders.filter(lambda x: x[1][2].upper() == "CANCELED")

ordersJoin = ordersCancelled.join(orderItems).map(
Example #42
#Command: spark-submit spark_wc.py
from pyspark import SparkConf, SparkContext

#Spark set-up
conf = SparkConf()
conf.setAppName("Word count App")
sc = SparkContext(conf=conf)

#Uncomment the sc.setLogLevel line once your program works fine.
#Run the program again to take the screenshot.
#sc.setLogLevel("WARN")

# Upload data file in Hadoop and provide its path in textFile function
rdd = sc.textFile("/user/spark/words.txt")
rdd = rdd.flatMap(lambda x: x.split(' '))
rdd = rdd.map(lambda x: (x, 1))
# Add few lines of code below

rdd = rdd.reduceByKey(lambda x, y: x + y)
# Add your code below
#
#
# you may store top 10 results in 'out' variable
# and use it to display as mentioned below.
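# One possible completion (an assumption, not the original solution): keep the
# ten most frequent words, ordered by descending count.
out = rdd.takeOrdered(10, key=lambda pair: -pair[1])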
for item in out:
    print item[0], '\t:\t', str(item[1])
def date_boro_aggr(x):
    """
    Convert [[(year-month, borough), count], ...] into a list of aggregated
    borough counts. The year-month is the same for every entry because we group
    by it before this mapping, and the boroughs are distinct. There will be six
    such lists, one for each of Apr, May, and Jun of 2014 and 2015.
    """
    temp = [0, 0, 0, 0, 0]
    for ele in x[1]:
        temp[int(ele[0][1])] += int(ele[1])
    return temp


n_of_periods = 3  #3 month for now

conf = SparkConf()
conf.setAppName("Residential_Analysis")

# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

#------Taxi Analysis

# read the csv file  which has no header
taxiFileWithNoHeader = sc.textFile("/taxi_combined.csv")

# map each entry to ((year-month, borough integer representation), 1)
taxi_date_boro = taxiFileWithNoHeader.map(lambda line: line.split(",")).map(
    date_boro_mapper)

# reduce each mapped tuple by borough and year-month, then group by year-month so that we will have combined
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
import binascii

conf = SparkConf()
conf.setAppName('basestation-analyze')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("checkpoint")

initialStateRDD = sc.parallelize([])

stream_data = ssc.textFileStream("hdfs:///data/")


def decode_line(line):
    try:
        l = line.split(" ")
        if l[1] == "DCI":
            return [l[2]]
        else:
            return [""]
    except:
        return [""]


def message_count(m):
    return m, 1

Example #45
import os
import sys
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

os.environ[
    'PYSPARK_SUBMIT_ARGS'] = '--jars spark-streaming-kafka-assembly_2.11-1.6.3.jar pyspark-shell'
conf = SparkConf()
#conf.setMaster("spark://localhost:7077")
conf.setAppName("Test")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, 10)  # 10 second batch interval
kvs = KafkaUtils.createStream(ssc, "localhost:2181", "simpleConsumer",
                              {"test": 1})
lines = kvs.map(lambda x: x[1])
counts = lines.flatMap(lambda line: line.split(" ")).map(
    lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()
        hashtags_df.registerTempTable("hashtags")
        # get the top 10 hashtags from the table using SQL and print them
        hashtag_counts_df = sql_context.sql(
            "select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10"
        )
        hashtag_counts_df.show()
        writeTopElements(hashtag_counts_df)

    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)


# create spark configuration
conf = SparkConf()
conf.setAppName("TwitterStreamApp")
# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# create the Streaming Context from the above spark context with interval size 10 seconds
ssc = StreamingContext(sc, 10)
# setting a checkpoint to allow RDD recovery
ssc.checkpoint("checkpoint_TwitterApp")
# read data from port 9008
dataStream = ssc.socketTextStream("localhost", 9008)
# split each tweet into words
words = dataStream.flatMap(lambda line: line.split(" "))
# filter the words to get only hashtags, then map each hashtag to be a pair of (hashtag,1)
hashtags = words.map(lambda x: (x, 1))
# adding the count of each hashtag to its last count
tags_totals = hashtags.countByWindow(10 * 60 * 60, 30)
Example #47
from pyspark import SparkConf, SparkContext

from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from pyspark.ml.linalg import DenseVector
from pyspark.ml.classification import LogisticRegression

# Spark set-up
conf = SparkConf()
conf.setAppName("Logistic regression")

sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")

spark = SparkSession(sc)

# Load dataset file as RDD
rdd = sc.textFile("/user/spark/task5.txt")
rdd = rdd.map(lambda x: x.split(','))


def renameLabel(x):
    if x[4] == 'Iris-setosa':
        x[4] = 1
    elif x[4] == 'Iris-versicolor':
        x[4] = 2
    else:
        x[4] = 3
    return x
Example #48
import sys
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row, SQLContext, DataFrame
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType
from pyspark.streaming import StreamingContext

#paths for players.csv and teams.csv
players_csv_path = 'hdfs://localhost:9000/input/players.csv'
teams_csv_path = 'hdfs://localhost:9000/input/teams.csv'

# stores per match metrics for each player that resets at the end of the match
player_metrics = dict()

conf = SparkConf()
conf.setAppName('BigData Project')

spark_context = SparkContext(conf=conf, master="local[*]")
streaming_context = StreamingContext(spark_context, 2)

streaming_context.checkpoint('BigData Project Checkpoint')
input_stream = streaming_context.socketTextStream('localhost', 6100)

sqlContext = SQLContext(spark_context)

#loading players.csv as a dataframe
players_data_df = sqlContext.read.csv(players_csv_path, header=True)

#loading teams.csv as a dataframe
teams_data_df = sqlContext.read.csv(teams_csv_path, header=True)
Example #49
    name = str(row[0])
    numbers = [int(i) for i in row[1:] ]
    tuples = itertools.permutations(numbers,2)
    pc =  max(tuples, key=pewT)
    return [name, pc[0], pc[1]]


if __name__ == "__main__":    
    parser = argparse.ArgumentParser(description='PowerCouples Serial native version') 
    parser.add_argument('-i','--input', dest="input_csv", help="input file in csv format", required=True)
    parser.add_argument('-o','--output', dest="output_csv", help="output file in csv format", default=sys.stdout, type=argparse.FileType('w'))

    args = parser.parse_args()

    # set the spark context
    conf = SparkConf()
    #conf.setMaster("local[4]")
    conf.setAppName("PowerCouples")
    sc = SparkContext(conf=conf)

    # compute power couples
    infile = sc.textFile(args.input_csv,40)
    result = infile.map(find_powerCouple).map(lambda elem: elem[0]+","+str(elem[1])+","+str(elem[2])).collect()

    # write results
    out = csv.writer(args.output_csv)
    for row in result:
        out.writerow([row])


def generate_model_package(training_data_path, id_cols, target_cols,
                           fields_config_file, param_grid, model_name,
                           target_var):
    """
            training_data_path
            ,id_cols
            ,target_cols
            ,fields_config_file
            ,param_grid
            ,model_name
            ,target_var
    """

    pyspark_app_nm = "train_" + model_name + "_" + secrets.token_hex(nbytes=4)

    logging.info("Starting process: " + pyspark_app_nm)

    #create spark object and spark context for parallel learning
    logging.info("Instantiating pyspark.")
    app_pyspark_conf = SparkConf()
    app_pyspark_conf.setAppName(pyspark_app_nm)
    #     app_pyspark_conf.set('spark.executor.memory',spark_executor_memory)
    #     app_pyspark_conf.set('spark.executor.cores', spark_executor_cores)

    spark = SparkSession.builder.config(conf=app_pyspark_conf).getOrCreate()
    sc = spark.sparkContext

    #load data
    logging.info("Beginning data load.")
    training_df = pd.read_parquet(training_data_path, engine='pyarrow')
    # sampling down
    #     training_df_1 = training_df[training_df[target_var]==1].sample(20)
    #     training_df_0 = training_df[training_df[target_var]==0].sample(40)
    #     training_df = pd.concat([training_df_0,training_df_1])

    # column handling
    logging.info("Creating column lists")
    all_cols = training_df.columns.tolist()
    x_cols = list(set(all_cols) - (set(target_cols + id_cols)))

    # dataframe setup
    X = training_df[x_cols]
    y = training_df[target_cols]

    # create holdout data
    logging.info("Creating holdout data")
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y[target_var],
                                                        test_size=0.1,
                                                        stratify=y[target_var])

    wts = y_test.value_counts()
    wtrat = (wts[0] / wts[1])

    # instantiate model
    gbm = lgb.LGBMClassifier()

    fit_params = {
        "eval_set": [(x_test, y_test)],
        "eval_metric": ear_stop_eval_mtr,
        "early_stopping_rounds": ear_stop_rnds
        #         ,"scale_pos_weight": wtrat
    }

    grid_search = SparkGridSearchCV(sc,
                                    estimator=gbm,
                                    param_grid=param_grid,
                                    fit_params=fit_params)
    #     grid_search.fit(x_train,y_train)

    grid_search.fit(x_train, y_train)

    best_model = grid_search.best_estimator_
    optimized_parameters = best_model.get_params()

    # create confusion dataframe
    y_true = pd.DataFrame(y_test)
    y_true = y_true.reset_index()
    y_true.columns.values[0] = "CUSTOMER_KEY"
    y_true.columns.values[1] = "Y_TRUE"

    y_pred = pd.DataFrame(best_model.predict(x_test, y_test.tolist()),
                          columns=["Y_PRED"])

    confusion_data = pd.merge(left=y_true,
                              right=y_pred,
                              left_index=True,
                              right_index=True)

    # summary statistics and metrics

    fr_col_nam_map = {0: "feature_nm", 1: "feature_importance"}
    feature_ranking = pd.DataFrame(
        [X.columns, best_model.feature_importances_]).T
    feature_ranking = feature_ranking.rename(columns=fr_col_nam_map)
    feature_ranking = feature_ranking.sort_values("feature_nm",
                                                  ascending=False)

    metrics = {
        "precision_score":
        precision_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_auc_score":
        roc_auc_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "classification_report":
        classification_report(confusion_data['Y_TRUE'],
                              confusion_data['Y_PRED']),
        "confusion_matrix":
        confusion_matrix(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "accuracy_score":
        accuracy_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "precision_recall_curve":
        precision_recall_curve(confusion_data['Y_TRUE'],
                               confusion_data['Y_PRED']),
        "recall_score":
        recall_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_curve":
        roc_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED'])
    }

    output = {
        "model_name": model_name  # string with model name
        ,
        "model_class": best_model  # grid_search.best_estimator_
        ,
        "optimized_parameters": optimized_parameters  # best_model.get_params()
        ,
        "feature_ranking": feature_ranking  # best_model.feature_importances_
        ,
        "metrics": metrics,
        "confusion_data": confusion_data
    }

    return output
Example #51
    "-output",
    "--output",
    help="Complete output path for results ex. hdfs:/CCF/output")
parser.add_argument("-partition",
                    "--partition",
                    type=int,
                    help="Number of partitions for dataset")
args = parser.parse_args()
partition_number = args.partition
input_file_path = args.input
output_directory = args.output

# Initialize spark-context configuration
conf = SparkConf()
conf.setMaster('local')
conf.setAppName('pyspark-shell-CCF-SS-v2')
# Just for local execution
conf.set('spark.driver.host', '127.0.0.1')
conf.set("spark.ui.proxyBase", "")  # Just for having a nice gui locally
os.environ[
    'PYSPARK_PYTHON'] = '/Users/ccompain/.pyenv/versions/miniconda3-latest/bin/python'  # Needs to be explicitly provided as env. Otherwise workers run Python 2.7
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python'

sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")

# Initialize logger
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)

LOGGER.warn("################################")
        flightNum_1 = item['flightNum_1']
        flightNum_2 = item['flightNum_2']
        origin_2 = item['origin_2']
        total_delay = item['total_delay']
        table.put_item(Item=item)


def saveTopCarriers(carriers):
    sorted = carriers.sortBy(lambda item: item['total_delay'])
    for toSave in sorted.take(1):
        save_results(toSave)


if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("Problem_3-2")
    conf.set("spark.streaming.kafka.maxRatePerPartition", 50000)
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.python.worker.memory", "1g")

    # airports
    airports = ['CMI', 'ORD', 'LAX', 'JAX', 'DFW', 'CRP', 'SLC', 'BFL', 'SFO', 'PHX', 'JFK']
    year = "2008"

    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint("/tmp/streaming")

    brokers = "b-1.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092,b-2.cs598-tast2.n69c9p.c2.kafka.us-east-1.amazonaws.com:9092"
    topic = "cs598-task2"
Example #53
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
import sys
import requests
configuration = SparkConf()
configuration.setAppName("BigData")


def rc(line):

    t = line.split(";")[7]
    if ',' not in t:
        return [t]
    return t.split(",")


def f1(r):
    sr = r.sortBy(lambda x: (-x[1], x[0]))
    srr = sr.collect()
    c = 0
    i = 0
    if (srr != []):
        while (c != 5):
            if (srr[i][0] != ''):
                if (c != 4):
                    print(srr[i][0], end=',')
                else:
                    print(srr[i][0])
Example #54
					k=0
			#tags_count = [p.hashtag_count for p in df.select("hashtag_count").collect()]
			#hashtag_counts_df.pprint()
		except:
			e = sys.exc_info()[0]
			#print("Error: %s" % e)

def tmp(x):
	return (x.split(';')[0],1)
if __name__ == "__main__":
	if len(sys.argv) != 3:
		print("Usage: <file> <windowsize> <batchinterval>", file=sys.stderr)
		sys.exit(-1)

	conf=SparkConf()
	conf.setAppName("BigData")
	sc=SparkContext(conf=conf)

	ssc=StreamingContext(sc,int(sys.argv[2]))
	ssc.checkpoint("/checkpoint_BIGDATA")
	dataStream=ssc.socketTextStream("localhost",9009)
	# dataStream.pprint()
	#tweet=dataStream.map(tmp)
	# OR
	dataStream=dataStream.window(int(sys.argv[1]),1)
	tweet=dataStream.flatMap(lambda w:(w.split(';')[7].split(',')))
	hashtag=tweet.map(lambda w:(w,1))
	#hashtag.pprint()
	count=hashtag.reduceByKey(lambda x,y:x+y)
	#count.pprint()
Example #55
        results_file.write(','.join(output) + '\n\n')

        for freq_set in result_frequent_itemsets[1:]:
            results_file.write(','.join(map(str, (sorted(freq_set)))) + '\n\n')


if __name__ == '__main__':
    start_time = time.time()

    # initialize spark
    conf = SparkConf()
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.executor.memory", "4g")
    conf.setMaster('local[8]')
    conf.setAppName('Assignment_2')
    sc = SparkContext.getOrCreate(conf)

    # get args
    threshold = int(sys.argv[1])
    support = int(sys.argv[2])
    input_file = sys.argv[3]
    result_file = sys.argv[4]

    # create baskets rdd
    data = sc.textFile(input_file).map(lambda x: x.split(',')).map(
        lambda x: (x[0], x[1]))
    header = data.first()
    raw_data = data.filter(lambda x: x != header)

    baskets = raw_data.groupByKey().map(lambda x: (list(set(x[1])))).filter(
Example #56
        i += 1


def cleanData(x):
    hashtags = x.split(",")
    clean = []
    for hashtag in hashtags:
        if hashtag == " " or hashtag == "  " or hashtag == "":
            pass
        else:
            clean.append(hashtag)
    return clean


conf = SparkConf()
conf.setAppName("A2")
sc = SparkContext(conf=conf)

batch_size = sys.argv[2]
window_size = sys.argv[1]

ssc = StreamingContext(sc, float(batch_size))
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

tweet = dataStream.map(lambda w: w.split(';')[7])

tweet1 = tweet.flatMap(lambda w: cleanData(w))

tweet1 = tweet1.map(lambda x: (x, 1))
Example #57
from pyspark import SparkContext, SparkConf

logFile = "loggy.md"  # Should be some file on your system
conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("Simple Khan")
conf.set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)

logData = sc.textFile(logFile).cache()
keywords = ['Scala', 'Python']


def counter(line):
    return any(k in line for k in keywords)


numAs = logData.filter(counter).count()

print "Lines with keywords: %i " % (numAs)
Example #58
from pyspark import SparkConf
from pyspark import SparkContext


sparkconfig = SparkConf()
sparkconfig.setMaster("local[*]")
sparkconfig.setAppName("SparkCSVJOB")

def print_each_line(eachLine):
    print eachLine
    return


sparkcontext = SparkContext(conf= sparkconfig)

textFileRDD = sparkcontext.textFile("/home/dharshekthvel/Downloads/query_result.csv")

textFileRDD.foreach(print_each_line)


from pyspark import SparkConf,SparkContext
from operator import add
import string
import nltk
from nltk.corpus import stopwords
import re

conf=SparkConf()
conf.setAppName("Similarity Index")
conf.set("spark.executor.memory","2g")
conf.set("spark.ui.port","4098")
sc=SparkContext(conf=conf)

path="/cosc6339_hw2/gutenberg-500/"

#popular words
text=sc.textFile(path)
words = text.flatMap(lambda line:line.lower().split())
word = words.map(lambda x: re.sub('\W+','',x))
stops = set(stopwords.words('english'))
wordt = word.map(lambda x: ''.join([w1 for w1 in x.split() if w1 not in (stops)]))
wcounts= wordt.map(lambda w: (w, 1) )
counts = wcounts.reduceByKey(add, numPartitions=1)
count1 = counts.map(lambda (a,b) : (b,a))
count2 = count1.sortByKey(False)
count = count2.map(lambda (a,b) : (b,a))
count3 = count.take(1000)
count4 = sc.parallelize(count3,1)
removePunct=(lambda x:x not in string.punctuation)
finalWords=[]
out=count4.collect()
Example #60
import numpy as np
import pandas as pd
import pickle
import io
import time
from pyspark import SparkContext
from pyspark import SparkConf

import sys

conf = SparkConf()
conf.setMaster("spark://0.0.0.0:7077")
conf.setAppName("NumpyMult")
sc = SparkContext(conf=conf)


def addToServer(image):
    from elasticsearch import Elasticsearch
    from minio import Minio
    from minio.error import ResponseError

    es = Elasticsearch(['http://elasticsearch:9200'])
    minioClient = Minio('minio:9000',
                        access_key='minio',
                        secret_key='minio123',
                        secure=False)
    ret = ""
    try:
        t = time.time()
        buf = pickle.dumps(image[0])