def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # count the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example #2
def main(args):

    if len(args) < 2:
        sys.exit(1)

    # Setting the cluster configuration parameters
    spark_master = args[0]
    spark_data_file_name = args[1]
    file_path = CURR_DIR + "/" + spark_data_file_name

    conf = SparkConf()
    conf.setMaster(spark_master)
    conf.setAppName("Log Scanner")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    txt_logs = sc.textFile(file_path).filter(lambda line: check(line))
    access_logs = txt_logs.map(lambda line: AccessLog(line))

    #  Getting response_codes from log objects and caching it
    response_codes = access_logs.map(lambda log: log.get_status()).cache()
    log_count = response_codes.count()
    print("Total Resonse Codes: " + str(log_count))
    cnt = response_codes.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    response200 = cnt.filter(lambda x: x[0] == "200").map(lambda x: x[1]).collect()
    print("###########################")
    print("##  Success Rate : " + str(int(response200[0])*100/log_count) + " %  ##")
    print("###########################")
Example #3
def main():
    """
    Main entry point of the application
    """

    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc, file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
Example #4
def get_default_spark_conf():
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local-cluster[3,1,2048]"). \
        set("spark.ext.h2o.disable.ga","true"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.backend.cluster.mode", ExternalClusterTestHelper.cluster_mode()). \
        set("spark.ext.h2o.cloud.name", ExternalClusterTestHelper.unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", os.getenv("spark.ext.h2o.external.start.mode", "manual")) .\
        set("spark.sql.warehouse.dir", "file:" + os.path.join(os.getcwd(), "spark-warehouse"))


    if ExternalClusterTestHelper.tests_in_external_mode():
        conf.set("spark.ext.h2o.client.ip", ExternalClusterTestHelper.local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "2")

    return conf
Example #5
def configureSpark(app_name, master):
	
	#Configure SPARK
	conf = SparkConf().setAppName(app_name)
	conf = conf.setMaster(master)
	spark_context = SparkContext(conf=conf)
	return spark_context
Example #6
def main():
	conf = SparkConf()
	conf.set("spark.default.parallelism", "24")
	sc = SparkContext(appName="PhoneLab Preprocessing", conf=conf)

	lines = sc.textFile(data_files, use_unicode=False)

	# Create LogLine objects and filter out empty lines
	logs = lines.flatMap(ll_mapper)

	# Save in an intermediate format
	logs.saveAsTextFile(out_dir, compressionCodecClass=codec)
	# NOTE: returning here leaves the gap-detection code below unreachable
	return

	# Gap detection
	keyed = logs.map(ll_gap_map)
	merged = keyed.groupByKey()

	# At this point we have ((boot_id, date), [line_num]) tuples The last step.
	# is to find all the gaps within each key/tuple.
	result = merged.flatMap(find_gaps)
	gaps = result.collect()

	fd = open("/spark/gaps.json", 'w')
	fd.write(json.dumps(gaps, indent=4))
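# find_gaps and the ll_* helpers above are referenced but not shown. A hedged sketch of
# find_gaps only, based on the ((boot_id, date), [line_num]) structure described in the
# comments (the gap criterion and the output fields are assumptions of this sketch):
def find_gaps(keyed_lines):
	(boot_id, date), line_nums = keyed_lines
	gaps = []
	nums = sorted(line_nums)
	for prev, cur in zip(nums, nums[1:]):
		if cur - prev > 1:  # missing line numbers are treated as a gap
			gaps.append({"boot_id": boot_id, "date": date, "start": prev, "end": cur})
	return gaps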
Example #7
    def setUpClass(cls):

        class_name = cls.__name__
        conf = SparkConf()
        conf.set('spark.app.name', class_name)

        # Read the spark configuration and update the spark conf
        test_spark_config = ConfigParser.ConfigParser()
        test_spark_config.read('test_config.cfg')
        test_spark_config.sections()
        configs = dict(test_spark_config.items('spark_conf_test_generic'))
        for k, v in configs.items():
            conf.set(k, v)
        cls.spark_test_configs = configs
        # Create the spark context
        cls.sc = SparkContext(conf=conf)
        if 'PYSPARK_DRIVER_PYTHON' in configs.keys():
            cls.sc.pythonExec = configs['PYSPARK_DRIVER_PYTHON']
        else:
            cls.sc.pythonExec = 'python2.7'

        logger = cls.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s')
        cls.logger = logging.getLogger(__name__)
        cls.logger.setLevel(logging.DEBUG)
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                        '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
Example #9
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # count the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
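# Example #9 calls storeOffsetRanges and printOffsetRanges without defining them. A minimal
# sketch of those helpers, following the pattern in the Spark Streaming + Kafka integration
# guide (keeping the ranges in a module-level list is an assumption of this sketch):
offsetRanges = []

def storeOffsetRanges(rdd):
    global offsetRanges
    offsetRanges = rdd.offsetRanges()  # available on RDDs produced by createDirectStream
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))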
Example #10
def stackexchange_xml_spark_job():
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf()

    xml_file_address = "hdfs://" + server + "/" +\
                       bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME +\
                       bluebook_conf.STACKEXCHANGE_XML_FILE_NAME
                         
    json_ques_folder_address = "hdfs://" + server + "/" +\
                               bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME
    json_ans_folder_address = "hdfs://" + server + "/" +\
                              bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME
        
    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)
        
    file = spark_context.textFile(xml_file_address)

    # Ques and Ans files are stored separately depending on their 'posttypeid'
    # Ques -> posttypeid == 1
    # Ans -> posttypeid == 2
    ques = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '1')\
               .map(lambda d: jsoner(d))
    ans = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '2')\
               .map(lambda d: jsoner(d))
    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
Example #11
    def __call__(self):
        log.info("Processing wiki dump: %s ...", self.wk_dump_path)
        c = SparkConf().setAppName("Wikijson")

        log.info("Using spark master: %s", c.get("spark.master"))
        sc = SparkContext(conf=c)

        if os.path.isdir(self.output_path):
            log.warn("Writing over output path: %s", self.output_path)
            shutil.rmtree(self.output_path)

        # rdd of tuples: (title, namespace, id, redirect, content)
        pages = wikispark.get_pages_from_wikidump(sc, self.wk_dump_path)
        pages.cache()

        articles = wikispark.get_articles_from_pages(pages)
        redirects = wikispark.get_redirects_from_pages(pages)

        if self.redirect_links:
            articles = wikispark.redirect_article_links(articles, redirects)

        articles.map(self.article_to_json).map(json.dumps).saveAsTextFile(
            self.output_path, "org.apache.hadoop.io.compress.GzipCodec"
        )

        log.info("Done.")
Example #12
 def setUp(self):
     conf = SparkConf().setAppName('testing').setMaster('local[2]').set('spark.driver.host', 'localhost')
     conf.set('spark.ui.showConsoleProgress', False)
     self.session = SparkSession.builder.config(conf=conf).getOrCreate()
     self.test_data = [
         ('Ricardo', 'engineering', 2),
         ('Tisa', 'sales', 3),
         ('Sheree', 'marketing', 4), 
         ('Chantelle', 'engineering', 5),
         ('Kylee', 'finance', 2),
         ('Tamatha', 'marketing', 5),
         ('Trena', 'engineering', 2),
         ('Arica', 'engineering', 1),
         ('Santina', 'finance', 2),
         ('Daria', 'marketing', 1),
         ('Magnolia', 'sales', 2),
         ('Antonina', 'finance', 1),
         ('Sumiko', 'engineering', 1),
         ('Carmen', 'sales', 2),
         ('Delois', 'engineering', 1),
         ('Luetta', 'marketing', 3),
         ('Yessenia', 'sales', 1),
         ('Petra', 'engineering', 3),
         ('Charisse', 'engineering', 4),
         ('Lillian', 'engineering', 3),
         ('Wei', 'engineering', 2),
         ('Lahoma', 'sales', 2),
         ('Lucilla', 'marketing', 1),
         ('Stephaine', 'finance', 2),
     ]
Example #13
 def getSparkContext(self, appName, master):
     print(appName)
     print(master)
     conf = SparkConf().setAppName(appName).setMaster(master)
     conf.set("spark.local.ip", "127.0.0.1")
     conf.set("spark.driver.host", "127.0.0.1")
     return SparkContext(conf=conf)
Example #14
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt",  minPartitions= 500, use_unicode=False)
    rdd.unpersist()
#     print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter = ' '))
    
    GA.logInConsole(0 , "Data Vectorized!")
    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD,sc, 5))
#     GA.logInConsole(-10, "GA with range 4")
#     ss.append(GA.parallel_GA_main(norm,sc, 4))
#     GA.logInConsole(-10, "GA with range 5")
#     ss.append(GA.parallel_GA_main(norm,sc, 5))
#     GA.logInConsole(-10, "GA with range 3 and Sampled data set")
#    sampleRDD = norm.sample(False, 0.6, seed=10)
#    ss.append(GA.parallel_GA_main(sampleRDD,sc, 3))
    print(ss)
    #selectedSS = voted_subsapces(ss)
#     SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
Example #15
def read_conf():
    """
    Setting up spark contexts
    """
    conf = SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("Testing")
    return conf
Example #16
 def _test_broadcast_on_driver(self, *extra_confs):
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     bs = self.sc.broadcast(value=5)
     self.assertEqual(5, bs.value)
Example #17
def configureSpark():
	#Configure SPARK
	conf = SparkConf().setAppName("a")
	conf = conf.setMaster("local[*]")
	conf = conf.set("spark.executor.memory", "2g").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").set("spark.kryoserializer.buffer", "256").set("spark.akka.frameSize", "500").set("spark.rpc.askTimeout", "30").set('spark.executor.cores', '4').set('spark.driver.memory','2g')

	sc = SparkContext(conf=conf)
	return sc
Example #18
class OWSparkContext(SharedSparkContext, widget.OWWidget):
    priority = 0
    name = "Context"
    description = "Create a shared Spark (sc) and Hive (hc) Contexts"
    icon = "../icons/spark.png"

    want_main_area = False
    resizing_enabled = True

    conf = None

    def __init__(self):
        super().__init__()

        # The main label of the Control's GUI.
        # gui.label(self.controlArea, self, "Spark Context")

        self.conf = SparkConf()
        all_prefedined = dict(self.conf.getAll())
        # Create parameters Box.
        box = gui.widgetBox(self.controlArea, "Spark Application", addSpace = True)

        self.gui_parameters = OrderedDict()

        main_parameters = OrderedDict()
        main_parameters['spark.app.name'] = 'OrangeSpark'
        main_parameters['spark.master'] = 'yarn-client'
        main_parameters["spark.executor.instances"] = "8"
        main_parameters["spark.executor.cores"] = "4"
        main_parameters["spark.executor.memory"] = "8g"
        main_parameters["spark.driver.cores"] = "4"
        main_parameters["spark.driver.memory"] = "2g"
        main_parameters["spark.logConf"] = "false"
        main_parameters["spark.app.id"] = "dummy"

        for k, v in main_parameters.items():
            default_value = all_prefedined.setdefault(k, v)
            self.gui_parameters[k] = GuiParam(parent_widget = box, label = k, default_value = v)
            all_prefedined.pop(k)

        for k, v in all_prefedined.items():
            self.gui_parameters[k] = GuiParam(parent_widget = box, label = k, default_value = str(v))

        action_box = gui.widgetBox(box)
        # Action Button
        self.create_sc_btn = gui.button(action_box, self, label = 'Submit', callback = self.create_context)

    def onDeleteWidget(self):
        if self.sc:
            self.sc.stop()

    def create_context(self):

        for key, parameter in self.gui_parameters.items():
            self.conf.set(key, parameter.get_value())

        self.sc = SparkContext(conf = self.conf)
        self.hc = HiveContext(self.sc)
Example #19
def configureSpark(app_name, master):
	
	#Configure SPARK
	conf = SparkConf().setAppName(app_name)
	conf = conf.setMaster(master)
	#conf.set("fs.s3n.awsAccessKeyId", "")
	#conf.set("fs.s3n.awsSecretAccessKey", "")
	spark_context = SparkContext(conf=conf)
	return spark_context
Example #20
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data")
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    rdd = load_data(sc)  
    print(rdd.getNumPartitions())
    parallel_GA_main(sc, rdd, 5)
    
    sc.stop()
Example #21
def init_spark_context():
    # load spark context
    conf = SparkConf().setAppName("ctr-server")
    conf.set('spark.kryoserializer.buffer', '512mb')
    conf.set('spark.kryoserializer.buffer.max', '512')
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['/home/ec2-user/engine.py', '/home/ec2-user/app.py'])
 
    return sc
Example #22
def create_sc(master=None,
              py_files=None,
              spark_home=None,
              sparktk_home=None,
              pyspark_submit_args=None,
              app_name="sparktk",
              extra_conf=None):
    """
    Creates a SparkContext with sparktk defaults

    Many parameters can be overwritten

    :param master: spark master setting
    :param py_files: list of str of paths to python dependencies; Note that the current python
    package will be freshly zipped up and put in a tmp folder for shipping by spark, and then removed
    :param spark_home: override $SPARK_HOME
    :param sparktk_home: override $SPARKTK_HOME
    :param app_name: name of spark app
    :param extra_conf: dict for any extra spark conf settings, for ex. {"spark.hadoop.fs.default.name": "file:///"}
    :return: pyspark SparkContext
    """

    set_env_for_sparktk(spark_home, sparktk_home, pyspark_submit_args)

    # bug/behavior of PYSPARK_SUBMIT_ARGS requires 'pyspark-shell' on the end --check in future spark versions
    set_env('PYSPARK_SUBMIT_ARGS', ' '.join([os.environ['PYSPARK_SUBMIT_ARGS'], 'pyspark-shell']))

    if not master:
        master = default_spark_master
        logger.info("sparktk.create_sc() master not specified, setting to %s", master)

    conf = SparkConf().setMaster(master).setAppName(app_name)
    if extra_conf:
        for k, v in extra_conf.items():
            conf = conf.set(k, v)

    if not py_files:
        py_files = []

    # zip up the relevant pieces of sparktk and put it in the py_files...
    path = zip_sparktk()
    tmp_dir = os.path.dirname(path)
    logger.info("sparkconf created tmp dir for sparktk.zip %s" % tmp_dir)
    atexit.register(shutil.rmtree, tmp_dir)  # make python delete this folder when it shuts down

    py_files.append(path)

    msg = '\n'.join(["=" * 80,
                     "Creating SparkContext with the following SparkConf",
                     "pyFiles=%s" % str(py_files),
                     conf.toDebugString(),
                     "=" * 80])
    logger.info(msg)

    sc = SparkContext(conf=conf, pyFiles=py_files)

    return sc
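# A hedged usage sketch for create_sc above; the master URL is an illustrative assumption,
# and the extra_conf entry simply reuses the example given in the docstring:
sc = create_sc(master="local[4]",
               app_name="sparktk-example",
               extra_conf={"spark.hadoop.fs.default.name": "file:///"})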
Example #23
def create_spark_context(app_name="Quiz Bowl", lm_memory=False, profile=False):
    spark_conf = SparkConf()
    if lm_memory:
        pass
        # spark_conf = spark_conf.set('spark.max.cores', 30).set('spark.executor.cores', 30)
    if profile:
        spark_conf = spark_conf.set('spark.python.profile', True)
    spark_conf = spark_conf.set('spark.akka.frameSize', 300)
    return SparkContext(appName=app_name, master=QB_SPARK_MASTER, conf=spark_conf)
Example #24
def main():
    # Spark Configurations
    conf = SparkConf()
    conf.set("spark.master", "local[*]")
    conf = conf.setAppName('Learning PySpark')
    sc = SparkContext(conf=conf)
    df = sc\
        .textFile('IXQ_20170622080001.csv')\
        .map(lambda line: line.split(','))
    print(df.take(5))
Example #25
    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))
Example #26
def createSparkConf():
	from pyspark import SparkConf
	test_properties = conftest.test_properties()

	conf = SparkConf()
	conf.set("cloudant.host", test_properties["cloudanthost"])
	conf.set("cloudant.username", test_properties["cloudantusername"])
	conf.set("cloudant.password", test_properties["cloudantpassword"])
	
	return conf
Example #27
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating and SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example #28
def home3(request):
	#spark_home = os.environ['SPARK_HOME'] = '/usr/local/spark-1.5.2-bin-2.7.1/' #'/usr/local/spark/'
	#sys.path.insert(0,os.path.join(spark_home,'python'))
	#sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.8.2.1-src.zip'))

	#from pyspark import SparkContext, SparkConf
	#sc = SparkContext()
	#data=[1,2,3,4,5]
	#distData = sc.parallelize(data)
	#first = distData.take(1)
	#sc.stop()

	prefs = ["worldnews", "politics", "Economics", "Libertarian"]

	scfg = SparkConf()
	scfg.set("spark.cores.max", 64)
	sc = SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

	#data=[1,2,3,4,5]
	#distData = sc.parallelize(data)
	#first = distData.take(1)
	#sc.stop()

	try:
		# prep data
		raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
		parsed_counts = raw_counts.map(lambda st: eval(st))
		all_ratings = parsed_counts.map(tup_to_rating)
		# assign user-identified preferred subreddits
		raw_prefs = [(999, x, 100) for x in prefs]
		my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

		# train model
		model_input = all_ratings.union(my_prefs)
		#model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

		# candidate prefs for prediction
		#my_prefs_ids = set([javahash(x) for x in prefs])
		#all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
		#candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)

		#predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()

		#final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)

		#output = list( final.take(30) )
		sc.stop()
		#return output
		recommends = ["asfd"]  # output
	except Exception as e:
		print("App failed. Stopping gracefully")
		sc.stop()
		raise Exception(e)
Example #29
    def __init__(self, master, name):
        self.name=name
        self.master=master

        print "init spark ..."
        os.environ["HADOOP_HOME"]="D:\code\wqr\hadoop-common-2.2.0-bin"
        conf = SparkConf()
        conf.setMaster(self.master)
        conf.setAppName(self.name)

        self.sc = SparkContext(conf=conf)
Example #30
 def _test_multiple_broadcasts(self, *extra_confs):
     """
     Test broadcast variables make it OK to the executors.  Tests multiple broadcast variables,
     and also multiple jobs.
     """
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     self._test_encryption_helper([5])
     self._test_encryption_helper([5, 10, 20])
Example #31
import sys
import json
import matplotlib.pyplot as plt
import pandas as pd
import gzip
import locale
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructField, StructType, StringType

## CONFIG ##

sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)
locale.getdefaultlocale()
locale.getpreferredencoding()

conf = SparkConf().set('spark.driver.host', '127.0.0.1')
sc = SparkContext(master='local',
                  appName='Video Games rank - Bar graph',
                  conf=conf)
spark = SparkSession.builder.appName(
    "Video Games rank - Bar graph").getOrCreate()

## DATA ##

s = sc.textFile('hdfs://namenode:9000/Video_Games_5.json.gz')
df = spark.read.json(s)

## PROCESS ##

# Printing important information about the data
df.printSchema()
Example #32
from math import sqrt

def computeCosineSimilarity(ratingPairs):
    # (the opening of this example was lost in extraction; the import, the signature and the
    # accumulator initialisation here are reconstructed assumptions)
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1

    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))

    return (score, numPairs)


conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
sc = SparkContext(conf=conf)

print("\nLoading movie names...")
nameDict = loadMovieNames()

data = sc.textFile(
    "C:/Users/Andy/Dropbox/FactoryFloor/Repositories/Tutorial_Udemy_SparkPython/Course_Resources/ml-100k/u.data"
)

# Map ratings to key / value pairs: user ID => movie ID, rating
ratings = data.map(lambda l: l.split()).map(
    lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination.
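# The example stops right after the comment above; a hedged sketch of the self-join step it
# describes (filterDuplicates and the variable names below are assumptions of this sketch):
def filterDuplicates(userRatings):
    (movie1, rating1), (movie2, rating2) = userRatings[1]
    return movie1 < movie2  # keep each movie pair only once

joinedRatings = ratings.join(ratings)  # (userID, ((movieID1, rating1), (movieID2, rating2)))
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)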
Example #33
from pyspark import SparkConf, SparkContext

con = SparkConf()
sc = SparkContext(conf=con)

list3 = [1,2,3,5,6,7,9,8,1,2,3,1,2,7,8,9,2,3,4,8,9,10,1,7,8,2,7,9,9]
rdd1 = sc.parallelize(list3)
num = rdd1.count()
data = rdd1.countByValue()
print data
for k in data:
	print (k, "-",data[k])



Example #34
# first, import pyspark
from pyspark import SparkConf, SparkContext

# create a conf
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
sc = SparkContext(conf=conf)


# parse the RDD into key value pairs in tuple which contains integers
def parseLine(line):
    fields = line.split(',')
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)


# read the file and create a new RDD after mapping the parseline function
lines = sc.textFile("file:///SparkCourse/fakefriends.csv")
rdd = lines.map(parseLine)
# mapValues does not modify the key, but the values and we use reduceByKey to sum up the values(tuples)
totalsByAge = rdd.mapValues(lambda x: (x, 1)).reduceByKey(
    lambda x, y: (x[0] + y[0], x[1] + y[1]))
averagesByAge = totalsByAge.mapValues(lambda x: round((x[0] / x[1])))
results = averagesByAge.sortByKey().collect()  # in order to print out
for result in results:
    print(result)
Example #35
# Find popular movies from the ml-100k data

from pyspark import SparkConf, SparkContext

def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.ITEM") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

conf = SparkConf().setMaster("local").setAppName("PopularMovies")
sc = SparkContext(conf = conf)

nameDict = sc.broadcast(loadMovieNames())

lines = sc.textFile("file:///SparkCourse/ml-100k/u.data")
movies = lines.map(lambda x: (int(x.split()[1]), 1))
movieCounts = movies.reduceByKey(lambda x, y: x + y)

flipped = movieCounts.map( lambda x : (x[1], x[0]))
sortedMovies = flipped.sortByKey()

sortedMoviesWithNames = sortedMovies.map(lambda countMovie : (nameDict.value[countMovie[1]], countMovie[0]))

results = sortedMoviesWithNames.collect()

for result in results:
    print (result)
Example #36
import sys
from pyspark import SparkConf
from pyspark.sql.types import *
from pyspark import SparkContext
from csv import reader
import string
import io 
from pyspark.sql import functions
from pyspark.sql.session import SparkSession

conf = SparkConf().setAppName("311")
sc = SparkContext(conf=conf)
inputfile = sc.textFile(sys.argv[1], 1)
header = inputfile.first()
inputfile = inputfile.filter(lambda x: x!=header)
inputfile = inputfile.mapPartitions(lambda x: reader(x))

def extractor(line):
	Date = line[1]                                   
	AgencyName=line[4] 
	ComplaintType=line[5]
	Descriptor=line[6]


	if Descriptor.strip() == '':  
		Descriptor = 'NotSpecified'

	LocationType=line[7]

	if LocationType.strip() == '':
		LocationType = 'NotSpecified'
Example #37
import sys

from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def classify_tweet(tweet):
    # (the opening of this example was lost in extraction; the imports above and this
    # function name are reconstructed assumptions)
    if 'microsoft' in tweet.lower():
        return "msft"
    else:
        return "ibm"


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: consumer.py <kafka-host> <topic-name> <seconds>")
        exit(-1)

    kafka_host = sys.argv[1]
    topic_name = sys.argv[2]
    seconds = int(sys.argv[3])

    conf = SparkConf() \
        .setAppName("data_challenge")

    from pyspark_cassandra import CassandraSparkContext

    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, seconds)
    ssc.checkpoint('./output')

    d = dict()
    d['bootstrap.servers'] = kafka_host
    d['group.id'] = 'test-id'
    d['enable.auto.commit'] = 'false'

    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic_name], d)
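    # The example ends right after creating the direct stream; a hedged sketch of one way it
    # might continue, counting messages per company with the classifier shown at the top of
    # this example (classify_tweet is an assumed name, not the original author's code):
    counts = kafka_stream.map(lambda kv: kv[1]) \
                         .map(classify_tweet) \
                         .map(lambda company: (company, 1)) \
                         .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()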
Example #38
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster('local').setAppName('bfs_degree_of_seperation')
sc = SparkContext(conf=conf)

hit_counter = sc.accumulator(0)


def create_rdd():
    text_file = sc.textFile('./dataset/graph.txt')
    return text_file.map(convert_to_bfs)


def convert_to_bfs(line):
    start_id = 5306
    fields = line.split()
    person_id = int(fields[0])
    connections = []
    for connection in fields[1:]:
        connections.append(int(connection))

    colour = 'WHITE'
    distance = 9999

    if (person_id == start_id):
        colour = 'GRAY'
        distance = 0

    return (person_id, (connections, distance, colour))

Example #39
#!/usr/bin/env python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, asc, desc, lead, lag, udf, hour, month, dayofmonth
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType, IntegerType, DateType
from pyspark import SparkConf
import yaml
import datetime
import os

conf = SparkConf()
conf.set("spark.jars", os.getenv("HOME") + "/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar")
conf.set("spark.executor.extrajavaoptions", "-Xmx15000m")
conf.set("spark.executor.memory", "15g")
conf.set("spark.driver.memory", "15g")
conf.set("spark.storage.memoryFraction", "0")

spark = SparkSession.builder \
    .config(conf=conf) \
    .master("local") \
    .appName("SAIDI Calculator") \
    .getOrCreate()

config = open('config.yaml')
config = yaml.load(config)

#connect to the database
pw_df = spark.read.jdbc("jdbc:postgresql://timescale.lab11.eecs.umich.edu/powerwatch", "pw_dedupe",
        properties={"user": config['user'], "password": config['password'],"driver":"org.postgresql.Driver"})

#read the data that we care about
Example #40
    # 10.4
    task10_4 = context.sql(
        "SELECT comments_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY comments_score"
    )
    task10_5 = context.sql(
        "SELECT submission_score, SUM(positive_probability)/COUNT(positive_probability) AS pos, SUM(negative_probability)/ COUNT(negative_probability) AS neg FROM cumResult GROUP BY submission_score"
    )
    #    cumResult.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("cumResults.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_1.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_2.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_3.csv")
    task10_4.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_4.csv")
    task10_5.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("task10_5.csv")


if __name__ == "__main__":
    conf = SparkConf().setAppName("CS143 Project 2B")
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)
    sc.addPyFile("cleantext.py")
    comments = sqlContext.read.json("comments-minimal.json.bz2")
    submissions = sqlContext.read.json("submissions.json.bz2")
    main(sqlContext)
Example #41
from __future__ import division
from pyspark import SparkContext, SparkConf
from pyspark.mllib.stat import Statistics
from operator import add
import happybase
import csv

conf = SparkConf().setAppName('ListenerSummarizer')
sc = SparkContext(conf=conf)
conn = happybase.Connection('localhost')
ctable = conn.table('/user/mapr/cust_table')
ltable = conn.table('/user/mapr/live_table')
atable = conn.table('/user/mapr/agg_table')
trackfile = sc.textFile('tracks.csv')
clicksfile = sc.textFile('clicks.csv')
trainfile = open('features.txt', 'wb')

def make_tracks_kv(str):
    l = str.split(",")
    return [l[1], [[int(l[2]), l[3], int(l[4]), l[5]]]]

def clicks_summary(str):
    l = str.split(",")
    custid = l[1]
    adv = l[2]
    if (adv == "ADV_REDUCED_1DAY"):
        return (custid, 1)

def compute_stats_byuser(tracks):
    mcount = morn = aft = eve = night = 0
    tracklist = []
Example #42
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF


# Function for printing each element in RDD
def println(x):
    print x


# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local[*]").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load documents (one per line).
rawData = sc.textFile("doc-sample.csv")
fields = rawData.map(lambda x: x.split(","))
documents = fields.map(lambda x: x[1].split(" "))

documentId = fields.map(lambda x: x[0])

# Creating Hash table and TF table
hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)

# Creating idf
tf.cache()
idf = IDF(minDocFreq=1).fit(tf)

# Calculate TF/IDF
tfidf = idf.transform(tf)
Example #43
import sys
from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: spark-submit CountCharacters.py <file>", file=sys.stderr)
        exit(-1)

    spark_conf = SparkConf()
    spark_context = SparkContext(conf=spark_conf)

    logger = spark_context._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    lines = spark_context \
        .textFile(sys.argv[1])\
        .persist()

    number_of_as = lines\
        .filter(lambda line: "a" in line)\
        .count()

    number_of_bs = lines\
        .filter(lambda line: "b" in line)\
        .count()

    number_of_cs = lines\
        .filter(lambda line: "c" in line)\
        .saveAsTextFile("Cs.txt")

    print("Number of 'a's: " + str(number_of_as))
Example #44
        print "No such MODE parameter as {0}, ending.".format(MODE)
        print ""
        exit(-1)

    print ""
    print "=================================================================="
    print "Main : Starting run at : ", datetime.datetime.now()
    print "Running on ",MyComputer
    print "Getting data from ",data_path
    print "Processing {0} files in batches of {1}".format(stop_after, batch_size)
    print >> runtimes_file
    print >> runtimes_file,"NEW RUN, STARTED AT {0}, STOP_AFTER = {1}".format(datetime.datetime.now(),stop_after)
    print ""

	# Connect to Spark
    config = SparkConf().setMaster('local[2]').set('spark.executor.memory', '4g')
    sc = SparkContext(conf = config, appName = 'SFOX - Coursework INM432')
        
    # ===============================================================================
    # Part 1 - Reading and Preparing text files
    # ===============================================================================
    
    # Part 1d - create TF
    if MODE == 'TF':
        print ""
        print "CREATING TFs"
        print >> runtimes_file,"CREATING TF"
        TF.create_tf(data_path, sc, diag_file, runtimes_file, stop_after, batch_size, report_diagnostics)

    # Part 1e - create IDF
    if MODE == 'IDF':
Example #45
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("MaxTemperatures")
sc = SparkContext(conf=conf)


def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0
    print("Temperature is:")
    return (stationID, entryType, temperature)


lines = sc.textFile("file:///SparkCourse/1800.csv")
parsedLines = lines.map(parseLine)
maxTemps = parsedLines.filter(lambda x: "TMAX" in x[1])
stationTemps = maxTemps.map(lambda x: (x[0], x[2]))
maxTemps = stationTemps.reduceByKey(lambda x, y: max(x, y))
results = maxTemps.collect()

for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))
# Variant: minimum temperature per station
minTempEntries = parsedLines.filter(lambda x: "TMIN" in x[1])
stationTemps = minTempEntries.map(lambda x: (x[0], x[2]))
minTemps = stationTemps.reduceByKey(lambda x, y: min(x, y))
results = minTemps.collect()
Example #46
import sys
import time
import itertools
import os
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from operator import  add
from graphframes import *


# os.environ["PYSPARK_SUBMIT_ARGS"] = (
# "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")
conf = SparkConf()\
    .setMaster("local[3]")\
    .set("spark.executor.memory", "4g")\
    .set("spark.driver.memory", "4g")


sc = SparkContext(conf=conf)
sqlCtx = SQLContext(sc)
spark = SparkSession(sc)
hasattr(sc, "toDF")
tick = time.time()

#local run
threshold = 7
input_file = "data/HW4/ub_sample_data.csv"
output_file = "data/HW4/task1_output.csv"

# threshold = int(sys.argv[1])
Example #47
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName('TestDstream')
conf.setMaster('yarn-cluster')
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 3)

lines = ssc.textFileStream('file:///develop/testdata/')
words = lines.flatMap(lambda x: x.split(' '))
wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)

wordCounts.pprint()
ssc.start()
ssc.awaitTermination()
Example #48
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.sql import SQLContext
# from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.functions import udf
import operator

conf = SparkConf().setAppName('Anomaly Detection')
sc = SparkContext(conf=conf)
sqlCt = SQLContext(sc)


class AnomalyDetection():
    def readData(self, filename):
        self.rawDF = sqlCt.read.parquet(filename).cache()

    def cat2Num(self, df, indices):

        first_feature = df.select(
            df.rawFeatures[indices[0]]).distinct().collect()
        second_feature = df.select(
            df.rawFeatures[indices[1]]).distinct().collect()

        #Creating 2 lists to extract features
        first_feature_lst = []
        second_feature_lst = []

        for row in first_feature:
            first_feature_lst.append(row[0])
Example #49
        results_file.write('Frequent Itemsets:\n')
        output = []
        for single_item in sorted(result_frequent_itemsets[0]):
            output.append('(\'' + str(single_item[0]) + '\')')

        results_file.write(','.join(output) + '\n\n')

        for freq_set in result_frequent_itemsets[1:]:
            results_file.write(','.join(map(str, (sorted(freq_set)))) + '\n\n')


if __name__ == '__main__':
    start_time = time.time()

    # initialize spark
    conf = SparkConf()
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.executor.memory", "4g")
    conf.setMaster('local[8]')
    conf.setAppName('Assignment_2')
    sc = SparkContext.getOrCreate(conf)

    # get args
    case = int(sys.argv[1])
    support = int(sys.argv[2])
    input_file = sys.argv[3]
    result_file = sys.argv[4]

    # create baskets rdd
    data = sc.textFile(input_file).map(lambda x: x.split(',')).map(
        lambda x: (x[0], x[1]))
Example #50
from pyspark import SparkConf, SparkContext
import csv
conf = SparkConf().setMaster("local").setAppName("PFE")
sc = SparkContext(conf=conf)

# generate labeled dataset
lines = sc.textFile("subjectivity.tff")

champs = lines.map(lambda x: x.split(' '))
results = champs.collect()
labels = []
words = []
for result in results:
    champ0 = result[0]
    len1 = len(champ0)
    label = champ0[5:len1]
    labels.append(label)

    champ2 = result[2]
    len2 = len(champ2)
    word = champ2[6:len2]
    words.append(word)

keys = words
values = labels
dictionary = dict(zip(keys, values))


def check_label(word):
    label = ''
    for key, value in dictionary.items():
Example #51
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_SUBJECT_D004031').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5:
    if sys.argv[5] == "hive":
        sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# first day of the current month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# last day of the previous month
V_DT_LMD = (date(int(etl_date[0:4]), int(etl_date[4:6]), 1) +
            timedelta(-1)).strftime("%Y%m%d")
# 10-character date
Example #52
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = SparkConf()
conf.setAppName("Json Pyspark")

sc = SparkContext(conf=conf)

# Create SQL Context
sqlContext = SQLContext(sc)

# Read Json file using sqlcontext
depJSON = sqlContext.jsonFile(
    "/user/cloudera/output/itversity/pig/departments_data.json/part-m-00000")

for x in depJSON.collect():
    print(x)

# Create A Structure on top of the raw json data
depJSON.registerTempTable("deps")

# Now query the table
depRDD = sqlContext.sql("Select * from deps")
for x in depRDD.collect():
    print(x)

# Another method to convert the raw file into a structure. This was removed from Spark 1.3.0 onwards, since DataFrames were introduced.
# Also, all the methods on SQLContext/HiveContext used to return a SchemaRDD in 1.2.1, but from 1.3.0 onwards these functions return a DataFrame object.
# From 1.3.0 onwards the SchemaRDD class is completely removed from Spark.
#sqlContext.registerRDDAsTable(depJSON, "departments")
Example #53
def init_spark_context():
    conf = SparkConf().setAppName("yelp data recommender")
    sc = SparkContext(conf=conf, pyFiles=['recommender.py', 'app.py'])
    return sc
Example #54
# -*- coding: utf-8 -*-

from  pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import os
import time

import re
if __name__ == "__main__":

    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # PYSPARK_PYTHON = "C:\\Python27\\python.exe"    # with multiple Python versions installed, set this variable to choose which interpreter to use
    # os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON

    def valid_casedate(strdate):
        '''Check whether the string is a valid YYYY-MM-DD date.'''
        try:
            time.strptime(strdate, "%Y-%m-%d")
            return True
        except:
            return False


    def get_date(j, a):
        d = re.split(ur"年|月|日", j)
        if len(d) == 4:
            if len(d[0]) == 4:
Example #55
__author__ = 'yjxiong'

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *

conf = SparkConf().setAppName('build_parquet').setMaster("local[24]")\
    .set('spark.executor.memory', '4g')

sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

# yfcc_field_list = open('yfcc_schema.txt').read().split()

yfcc_field_list = ['image_id', 'labels']

fields = [
    StructField(field_name, StringType(), True)
    for field_name in yfcc_field_list
]

yfcc_schema = StructType(fields)

# yfcc_data = sc.textFile('/TMP/yjxiong/YFCC/temp_store/yfcc100m_rich_text_eng_dataset_success_images')\
#     .map(lambda x: x.split('\t')).coalesce(2000)

yfcc_data = sc.textFile('/TMP/yjxiong/YFCC/temp_store/yfcc100m_rich_text_eng_dataset_success_tag_ids')\
    .map(lambda x: x.split('\t')).coalesce(10)

yfcc_schema_df = sqlc.createDataFrame(yfcc_data, yfcc_schema)
Example #56
    return [(query_dataset, single_column_data.to_csv(index=False),
             single_column_joined_data.to_csv(index=False), new_record)]


def save_id_to_record(id_, record, key_name):

    record[key_name] = id_
    return record


if __name__ == '__main__':

    start_time = time.time()

    # Spark context
    conf = SparkConf().setAppName("Data Generation for Use Cases")
    sc = SparkContext(conf=conf)

    # parameters
    params = json.load(open(".params.json"))
    training_records_file = params['training_records']
    datasets_dir = params['datasets_directory']
    output_dir = params['new_datasets_directory']
    hdfs_address = params['hdfs_address']
    hdfs_user = params['hdfs_user']

    # HDFS Client
    hdfs_client = InsecureClient(hdfs_address, user=hdfs_user)

    create_dir(output_dir, hdfs_client)
Example #57
import numpy as np

from pyspark import SparkContext, SparkConf
from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS

# For reproducibility
np.random.seed(1000)

nb_users = 200
nb_products = 100
ratings = []

if __name__ == '__main__':
    conf = SparkConf().setAppName('ALS').setMaster('local[*]')
    sc = SparkContext(conf=conf)

    for _ in range(10):
        for i in range(nb_users):
            rating = Rating(user=i,
                            product=np.random.randint(1, nb_products),
                            rating=np.random.randint(0, 5))
            ratings.append(rating)

    # Parallelize the ratings
    ratings = sc.parallelize(ratings)

    # Train the model
    model = ALS.train(ratings, rank=5, iterations=10)

    # Test the model
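    # The example is cut off here; a hedged sketch of what the test step might look like with
    # pyspark.mllib ALS (the user/product ids below are arbitrary assumptions):
    print(model.predict(0, 20))  # predicted rating for user 0, product 20
    user_products = ratings.map(lambda r: (r.user, r.product))
    predictions = model.predictAll(user_products)
    print(predictions.take(5))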
Example #58
    f[21] = len(filter(lambda x: x[0] <= 1 and x[1] == 4, items))  # purchases in the last 1 day
    f[22] = len(filter(lambda x: x[0] <= 3 and x[1] == 4, items))  # purchases in the last 3 days
    f[23] = len(filter(lambda x: x[0] <= 7 and x[1] == 4, items))  # purchases in the last 7 days
    f[38] = round(1.0 * len(items) /
                  f[32], 4) if f[32] != 0 else 0.0  # average number of interactions per item for this user
    return "\t".join([str(i) for i in f])


global etime
global subset

if __name__ == "__main__":
    import fileinput
    conf = (SparkConf().setMaster(
        "spark://namenode.omnilab.sjtu.edu.cn:7077").setAppName("Extract").set(
            "spark.cores.max", "32").set("spark.driver.memory",
                                         "4g").set("spark.executor.memory",
                                                   "6g"))
    sc = SparkContext(conf=conf)
    lines = sc.textFile(
        'hdfs://namenode.omnilab.sjtu.edu.cn/user/qiangsiwei/competition_tianchi/uid_iid',
        1)
    target, etime, subset = "12-19-0", "12-18-23", {}
    # target, etime, subset = "12-18-0", "12-17-23", {}
    # target, etime, subset = "12-17-0", "12-16-23", {}
    # target, etime, subset = "12-16-0", "12-15-23", {}
    # target, etime, subset = "12-15-0", "12-14-23", {}
    # target, etime, subset = "12-14-0", "12-13-23", {}
    # target, etime, subset = "12-13-0", "12-12-23", {}
    # target, etime, subset = "12-12-0", "12-11-23", {}
    # target, etime, subset = "12-11-0", "12-10-23", {}
Example #59
import sys
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import time, copy, re, math
from datetime import datetime, timedelta, date
import json
import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

reload(sys)
sys.setdefaultencoding('utf-8')

#warehouse_location = '/user/hive/warehouse/'
conf = SparkConf().set('spark.driver.maxResultSize', '20g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.yarn.executor.memoryOverhead', '10g')
conf.set('spark.sql.shuffle.partitions', 800)
conf.set('spark.shuffle.io.retryWait', '10s')
conf.set('spark.executor.memory', '10g')
conf.set('spark.executor.instances', 100)
conf.set('spark.executor.cores', 4)
conf.set('spark.executor.extraJavaOptions',
         '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')
#conf.set("spark.sql.warehouse.dir", warehouse_location)

spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .enableHiveSupport() \
Example #60
    # print(count)


# Main Execution
# Run Configurations
case_num = int(sys.argv[1])
support = int(sys.argv[2])
input_path = sys.argv[3]
output_path = sys.argv[4]
# Level of Parallelism - Recommended by Spark
# http://spark.apache.org/docs/latest/tuning.html#level-of-parallelism
cpu_num = multiprocessing.cpu_count()
task_per_cpu = cpu_num * 3

# Spark Configurations
conf = SparkConf().setAppName('HW2 - Task 1').setMaster('local[*]')
sc = SparkContext(conf=conf)

# Data Input
distFile = sc.textFile(input_path, minPartitions=2).coalesce(task_per_cpu)
rdd = distFile.map(lambda s: s.split(","))

# SON Algorithm
# Divide into market basket model
headers = rdd.first()
data = rdd.filter(lambda s: s != headers)
grouped_data = get_grouped_data(data, case_num)

num_count = grouped_data.count()
num_part = grouped_data.getNumPartitions()
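# get_grouped_data is referenced above but not shown; a hedged sketch of what it might do,
# based only on the case_num argument and the market-basket comments (column order assumed):
def get_grouped_data(data, case_num):
    if case_num == 1:
        pairs = data.map(lambda s: (s[0], s[1]))  # baskets keyed by the first column
    else:
        pairs = data.map(lambda s: (s[1], s[0]))  # baskets keyed by the second column
    return pairs.groupByKey().mapValues(set)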