Example #1
from pyspark import SparkConf, SparkContext


def setup():
    ''' Set up the SparkContext and get a log4j logger. '''
    # Build the configuration and set an example executor environment variable
    conf = SparkConf()
    conf.setExecutorEnv(key='Auth', value='value', pairs=None)
    sc = SparkContext(conf=conf)
    # Reach into the JVM through py4j for the log4j LogManager
    logger = sc._jvm.org.apache.log4j.LogManager.getLogger(__name__)
    return sc, logger
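A note on what setExecutorEnv does here: it defines ordinary process environment variables on the executors. A minimal sketch of reading the variable configured above from inside a task (assuming the script is launched through spark-submit, which supplies the master URL; the 'Auth' name is just the placeholder used in this example):

import os

sc, logger = setup()

def read_auth(_):
    # Variables set with setExecutorEnv appear in os.environ on the executors
    return os.environ.get('Auth')

# Expected to collect ['value'] from the single partition
print(sc.parallelize([0], 1).map(read_auth).collect())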
Example #2
File: spark.py Project: daskos/epos
    def closure(*args, **kwargs):
        try:
            options = opts
            options.update({
                'sql_parquet_compression_codec': 'uncompressed',
                'mesos_role': role,
                'mesos_coarse': bool(coarse),
                'cores_max': int(coarse) or None,
                'executor_cores': int(executor_cores),
                'executor_memory': '{}m'.format(int(executor_memory / MiB)),
                'driver_memory': '{}m'.format(int(driver_memory / MiB)),
                'mesos_executor_memoryOverhead': int(
                    (memory_overhead or (executor_cores * python_worker_memory +
                                         0.1 * executor_memory))
                    / MiB),
                'python_worker_memory': int(python_worker_memory / MiB),
                'mesos_uris': ','.join(uris),
                'mesos_executor_docker_image': docker
            })
            options = {'spark.{}'.format(k.replace('_', '.')): str(v)
                       for k, v in options.items() if v not in (None, '')}
            environs = envs.items()
        except TypeError as e:
            # curry doesn't reraise TypeErrors:
            # https://github.com/pytoolz/toolz/issues/288
            raise Exception(repr(e))

        conf = SparkConf()
        conf.setMaster(str(master))
        conf.setAppName(str(name or fn.__name__))
        conf.setAll(pairs=options.items())
        conf.setExecutorEnv(pairs=environs)

        with SparkContext(conf=conf) as sc:
            sc.setLogLevel(str(log))
            # map() is lazy on Python 3, so apply the side effects explicitly
            for path in files:
                sc.addFile(path)
            for path in pyfiles:
                sc.addPyFile(path)
            # TODO: use SparkSession
            sql = SQLContext(sc)
            return fn(sc, sql, *args, **kwargs)
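For context, closure() above is the inner function of a parameterized decorator; the free variables (opts, role, master, fn, and so on) come from the enclosing scope. A minimal sketch of what that enclosing decorator might look like (the name spark, the signature, and the defaults are assumptions for illustration, not the actual epos API):

from functools import wraps

MiB = 2 ** 20  # assumed unit constant


def spark(master='local[*]', name=None, role='*', coarse=True,
          executor_cores=2, executor_memory=4 * 1024 * MiB,
          driver_memory=1024 * MiB, python_worker_memory=512 * MiB,
          memory_overhead=None, uris=(), docker=None, envs=None,
          files=(), pyfiles=(), log='WARN', opts=None):
    """Hypothetical outer decorator supplying the free variables used by closure()."""
    opts = opts or {}
    envs = envs or {}

    def decorator(fn):
        @wraps(fn)
        def closure(*args, **kwargs):
            ...  # body as shown above
        return closure

    return decorator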
Example #3
    def init(self,
             name=None,
             master='yarn-client',
             config_parameters=None,
             interpreter_path=None,
             dependent_jars=None,
             pypath='pyspark.zip:py4j-0.10.6-src.zip'):
        import os
        import sys

        from pyspark import SparkConf
        from pyspark.context import SparkContext
        all_env_var_str = ''

        if name is None:
            raise ValueError('Please specify a name for the spark application')
        else:
            name_str = " --name " + name.replace(" ", "")

        if dependent_jars is None:
            dependent_jars = []
        else:
            for dep_jars in dependent_jars:
                self.add_jars(dep_jars)
            driver_jars = ":".join(self.get_jars())
            all_env_var_str = "--driver-class-path " + driver_jars

        if config_parameters and 'spark.driver.memory' in config_parameters:
            d_mem_str = " --driver-memory " + config_parameters[
                'spark.driver.memory'].replace(" ", "")
            all_env_var_str = all_env_var_str + d_mem_str + name_str + " " + self.shell_arg
            os.environ["PYSPARK_SUBMIT_ARGS"] = all_env_var_str.replace(
                "  ", " ")
        else:
            all_env_var_str = all_env_var_str + name_str + " " + self.shell_arg
            os.environ["PYSPARK_SUBMIT_ARGS"] = all_env_var_str.replace(
                "  ", " ")

        if interpreter_path is None:
            interpreter_path = sys.executable
        os.environ["PYSPARK_PYTHON"] = interpreter_path

        conf = SparkConf()
        conf.setMaster(master)
        conf.set('spark.jars', ",".join(self.get_jars()))
        conf.set('spark.jars.packages', ",".join(self.coordinates))
        conf.set(
            'spark.yarn.dist.files',
            'file:/usr/hdp/current/spark2-client/python/lib/pyspark.zip,'
            'file:/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip'
        )
        conf.set('spark.sql.codegen.wholeStage', 'false')
        if config_parameters is None:
            config_parameters = {}
        else:
            for option in list(config_parameters.keys()):
                conf.set(option, config_parameters[option].replace(" ", ""))

        if self.spark_version == 2:
            from pyspark.sql import SparkSession
            conf.setExecutorEnv('PYTHONPATH', pypath)
            spark = SparkSession \
                .builder \
                .enableHiveSupport() \
                .appName(name) \
                .config(conf=conf) \
                .getOrCreate()
            sc = spark.sparkContext
            sc.setLogLevel("ERROR")
            # sc.setLogLevel("info")

        else:
            conf.setAppName(name)
            sc = SparkContext.getOrCreate(conf=conf)

        sc.setCheckpointDir("tmp")

        print("Application Name: ", sc.appName)
        print("Application ID: ", sc.applicationId)
        print("Tracking URL: http://{}name3:8088/cluster/app/{}/".format(
            os.uname()[1][:7], sc.applicationId))
        return spark if self.spark_version == 2 else sc
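The PYSPARK_SUBMIT_ARGS string assembled above is read when the driver JVM is launched from a plain Python process, so it must be in place before the first SparkContext or SparkSession is created; self.shell_arg is presumably the trailing 'pyspark-shell' token. A minimal sketch of the same mechanism outside the class (the app name, memory size, and master are placeholders):

import os

# Must be set before the driver JVM starts, i.e. before any SparkContext exists.
os.environ["PYSPARK_SUBMIT_ARGS"] = "--name my-app --driver-memory 4g pyspark-shell"

from pyspark import SparkConf
from pyspark.context import SparkContext

conf = SparkConf()
conf.setMaster('local[*]')  # the example above targets 'yarn-client'
sc = SparkContext.getOrCreate(conf=conf)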
Example #4
from pyspark import sql, SparkConf, SparkContext
import pyspark.sql.functions as f
import numpy as np

conf = SparkConf().setAppName('Benchmarks')
conf.setExecutorEnv('spark.executor.memory', '2g')
conf.setExecutorEnv('spark.driver.memory', '30g')
sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)


def read_file(df=None, data_path=None):
    return sqlContext.read.parquet(data_path)


def mean(df):
    return df.select(f.mean('fare_amount')).collect()


def standard_deviation(df):
    return df.select(f.stddev('fare_amount')).collect()


def sum_columns(df):
    return df.select(f.mean(df['fare_amount'] + df['trip_distance'])).collect()


def product_columns(df):
    return df.select(f.mean(df['fare_amount'] * df['trip_distance'])).collect()
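Note that the two setExecutorEnv calls above create executor environment variables literally named 'spark.executor.memory' and 'spark.driver.memory'; they do not change any JVM memory settings. If the intent is to size the executors and the driver, the properties would normally go on the conf itself, roughly as follows (the sizes are simply the values from the example):

from pyspark import sql, SparkConf, SparkContext

conf = SparkConf().setAppName('Benchmarks')
# Spark properties are set with conf.set(); setExecutorEnv is for OS-level
# environment variables, not for spark.* configuration keys.
conf.set('spark.executor.memory', '2g')
# Driver memory only takes effect if fixed before the driver JVM starts,
# e.g. via spark-submit --driver-memory 30g.
conf.set('spark.driver.memory', '30g')

sc = SparkContext(conf=conf)
sqlContext = sql.SQLContext(sc)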

Example #5

import argparse

from pyspark import SparkConf, SparkContext

parser = argparse.ArgumentParser()
parser.add_argument('-a', '--access_key')
parser.add_argument('-s', '--secret_access_key')
parser.add_argument('-l', '--copy_local', action='store_true')

config = parser.parse_args()

download = False

spark_config = None
if config.access_key and config.secret_access_key:
    download = True
    spark_config = SparkConf()
    spark_config.setExecutorEnv('AWS_ACCESS_KEY_ID', config.access_key)
    spark_config.setExecutorEnv('AWS_SECRET_ACCESS_KEY', config.secret_access_key)


# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'mean', conf=spark_config)
local_data_path = '/media/bitbucket/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'
data_path = local_data_path
data_url = 'https://nasanex.s3.amazonaws.com/NEX-DCP30/BCSD/rcp26/mon/atmos/pr/r1i1p1/v1.0/CONUS/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'

if download:
    data_path = data_url

# Download the file onto each node
if download or config.copy_local:
    sc.addFile(data_path)
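The script ships the NetCDF file with sc.addFile but stops before using it; a task would typically resolve its local copy through SparkFiles, roughly like this (a sketch, not part of the original script):

import os

from pyspark import SparkFiles


def has_local_copy(_):
    # SparkFiles.get resolves the worker-local path of a file added via sc.addFile
    return os.path.exists(SparkFiles.get(os.path.basename(data_path)))


print(sc.parallelize([0], 1).map(has_local_copy).collect())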
Example #6
def evaluate(evaluator):
    precise = evaluator.precision()
    coverage = evaluator.coverage()
    popularity = evaluator.popularity()
    recall = evaluator.recall()
    return precise,recall,coverage,popularity

if __name__ == "__main__":
    import os
    import time

    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, StringType

    PYSPARK_PYTHON = "/usr/bin/python2.7"
    os.environ["PYSPARK_PYTHON"] = PYSPARK_PYTHON
    conf = SparkConf().setAppName("amazonItemCF").setMaster("yarn-client")
    conf.set("spark.shuffle.file.buffer","128k").set("spark.reducer.maxSizeInFlight","96M")
    conf.set('spark.yarn.dist.files',
            'file:/root/hadoop-2.6/spark/python/lib/pyspark.zip,file:/root/hadoop-2.6/spark/python/lib/py4j-0.10.4-src.zip')
    conf.setExecutorEnv('PYTHONPATH','pyspark.zip:py4j-0.10.4-src.zip')
    conf.set('spark.executor.cores','30')
    conf.set('spark.executor.memory','95g')
    conf.set('spark.executor.instances','4')
    spark = SparkSession.builder\
        .config(conf=conf) \
        .enableHiveSupport()\
        .getOrCreate()
    spark.sql('set spark.sql.broadcastTimeout=30000')
    sc=spark.sparkContext 
    sc.setLogLevel('WARN')

    start = time.time()
    inputPath = "data/amazon/complete_csv"
    schema = StructType([
        StructField("user", StringType(), True),
Example #7
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# delete the old file and put the new file
import os
os.system("hadoop fs -rm -r -skipTrash /data/ratings_small.csv")
os.system("hdfs dfs -put ratings_small.csv /data")

print("start")

conf = SparkConf()
conf.setMaster("spark://master:7077")
conf.setAppName("recommend_train")
conf.setExecutorEnv(key="executor-memory",value="3g")
conf.setExecutorEnv(key="driver-memory",value="9g")

sc = SparkContext(conf=conf)
#sc = SparkContext("local")

text = sc.textFile("/data/ratings_small.csv")
text = text.filter(lambda x: "movieId" not in x)
movieRatings = text.map(lambda x: x.split(",")[:3])

print("start counting")
from pyspark.mllib.recommendation import ALS
model = ALS.train(movieRatings, 10, 10, 0.01)  

model.save(sc,"/data/model1")
print(model.recommendProducts(1, 5))
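As in Example #4, the setExecutorEnv calls above define environment variables literally named "executor-memory" and "driver-memory" (those are spark-submit flag names); they do not allocate memory. A sketch of the usual way to request those sizes (values taken from the example):

from pyspark import SparkConf

conf = SparkConf()
conf.setMaster("spark://master:7077")
conf.setAppName("recommend_train")
conf.set("spark.executor.memory", "3g")
# Driver memory generally has to be fixed before the driver JVM starts,
# e.g. spark-submit --driver-memory 9g <script.py>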
Example #8
from pyspark import SparkConf


def create_spark_conf(coreNum, nodeNum):
    print("coreNum: %s, nodeNum: %s" % (coreNum, nodeNum))
    sparkConf = SparkConf()
    sparkConf.setExecutorEnv("DL_ENGINE_TYPE", "mklblas")
    sparkConf.setExecutorEnv("MKL_DISABLE_FAST_MM", "1")
    sparkConf.setExecutorEnv("KMP_BLOCKTIME", "0")
    sparkConf.setExecutorEnv("OMP_WAIT_POLICY", "passive")
    sparkConf.setExecutorEnv("OMP_NUM_THREADS", "1")
    sparkConf.setExecutorEnv("DL_CORE_NUMBER", str(coreNum))
    sparkConf.setExecutorEnv("DL_NODE_NUMBER", str(nodeNum))
    sparkConf.set("spark.shuffle.blockTransferService", "nio")
    sparkConf.set("spark.scheduler.minRegisteredResourcesRatio", "1.0")
    return sparkConf
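A minimal sketch of using this helper (the master, core and node counts, and app name are placeholders):

from pyspark import SparkContext

conf = create_spark_conf(coreNum=4, nodeNum=2)
conf.setMaster('local[2]')
conf.setAppName('dl-example')
sc = SparkContext(conf=conf)
# setExecutorEnv entries are stored under the spark.executorEnv.* prefix
print(sc.getConf().get('spark.executorEnv.DL_CORE_NUMBER'))  # '4'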
Example #9
import argparse

from pyspark import SparkConf, SparkContext

parser = argparse.ArgumentParser()
parser.add_argument('-a', '--access_key')
parser.add_argument('-s', '--secret_access_key')
parser.add_argument('-l', '--copy_local', action='store_true')

config = parser.parse_args()

download = False

spark_config = None
if config.access_key and config.secret_access_key:
    download = True
    spark_config = SparkConf()
    spark_config.setExecutorEnv('AWS_ACCESS_KEY_ID', config.access_key)
    spark_config.setExecutorEnv('AWS_SECRET_ACCESS_KEY',
                                config.secret_access_key)

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'mean', conf=spark_config)
local_data_path = '/media/bitbucket/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'
data_path = local_data_path
data_url = 'https://nasanex.s3.amazonaws.com/NEX-DCP30/BCSD/rcp26/mon/atmos/pr/r1i1p1/v1.0/CONUS/pr_amon_BCSD_rcp26_r1i1p1_CONUS_bcc-csm1-1_202101-202512.nc'

if download:
    data_path = data_url

# Download the file onto each node
if download or config.copy_local:
    sc.addFile(data_path)
Example #10
import os

import swiftclient
from pyspark import SparkConf, SparkContext


def main(ST_AUTH, ST_USER, ST_KEY, TASKS, CORES, BLASTN, QUERY_FILE, MODE,
         OBJECT_STORES):
    ''' Main function
        ST_AUTH - Object storage auth string where fna containers are found
        ST_USER - Object storage user token
        ST_KEY - Object storage secret token
        TASKS - Number of tasks to launch, db partition factor
        CORES - Number of cores to devote to each task
        BLASTN - Location of blastn executable
        QUERY_FILE - fasta query file
        MODE - operation mode, 1 = top search, 2 = most common genome
        OBJECT_STORES - list of source containers that built the blast db
    '''
    # Set the context
    conf = SparkConf()
    conf.setExecutorEnv(key='Auth', value='value', pairs=None)
    sc = SparkContext(conf=conf)

    # Quiet the logs
    sc.setLogLevel("WARN")

    N = 5  # number of top results to take

    # Set our Spark database creation script and add all the files that need
    # to be on the remote hosts to the shell script
    ShellScript = "spark_blast.bash"
    sc.addFile(ShellScript)
    sc.addFile(QUERY_FILE)

    # Copy over blastn if it is local
    if os.path.dirname(BLASTN) == "." or os.path.dirname(BLASTN) == "":
        sc.addFile(BLASTN)

    # Get the file name part of QUERY_FILE
    Query_File = os.path.basename(QUERY_FILE)

    # this will be our root name for our DB names
    container = "blastdb_" + "-".join(sorted(OBJECT_STORES)) + "_" + str(TASKS)

    # log into swift
    conn = swiftclient.Connection(user=ST_USER, key=ST_KEY, authurl=ST_AUTH)

    # Verify the container we need is present
    if container not in [t['name'] for t in conn.get_account()[1]]:
        print("No database parition created for %s partition factor %d" %
              ("+".join(sorted(OBJECT_STORES)), TASKS))
        exit()

    # Get the list of objects we are going to need
    dbs = {}
    print("Collecting DBs from " + container)
    for data in conn.get_container(container)[1]:
        base = data['name'].split('.', 1)[0]
        if base not in dbs:
            dbs[base] = []
        dbs[base].append(data['name'])

    # Assemble our task list
    files = []
    for db in dbs:
        files.append("%s %s %s %s" %
                     (Query_File, container, db, " ".join(dbs[db])))

    # Distribute our data
    distData = sc.parallelize(files, TASKS)

    options = ""

    # TODO -- these didn't work when I used them, so I commented them out for now - Peter
    # Set our search options
    # if MODE == "1":
    #    options = "-max_target_seqs 1"
    # elif MODE == "2":
    #     options = ??

    # Pass our parameters to the bash script; ideally we would also pass the
    # executor/task ID, but that does not appear to be available in Spark 2.1.1
    pipeRDD = distData.pipe(
        ShellScript, {
            'ST_AUTH': ST_AUTH,
            'ST_USER': ST_USER,
            'ST_KEY': ST_KEY,
            'THREADS': str(CORES),
            'OPTIONS': options,
            'BLASTN': BLASTN
        })

    # Now let the bash script do its work.  This will run blast using our query file across all the
    # DB partitions searching for matching genomic reads.
    #
    # Failing to fetch me at first keep encouraged,
    # Missing me one place search another,
    # I stop somewhere waiting for you.
    #   -- Walt Whitman - Leaves of Grass: Book 3, Song of Myself, Verse 52
    print("Search through all the DBs for matching sequence")
    if MODE == "1":
        query_count  = pipeRDD.map(lambda x : (x.split(',')[0], (x.split(',')[2], x.split('|')[-1:][0]) )) \
            .reduceByKey( lambda x, y : maxByIndex(x, y, 0)) \
            .sortByKey(True) \
            .map(lambda x : str(x[0]) + ", " + str(x[1][1]))
        for line in query_count.collect():
            print(line)
    elif MODE == "2":

        specie_count = pipeRDD.map( lambda x : (x.split(',')[11].split(' ', 1)[-1], x.split(',')[0] )) \
            .distinct() \
            .map(lambda x : (x[0], 1)) \
            .reduceByKey(lambda x,y:x+y) \
            .sortByKey(False) \
            .map(lambda x:(str(x[0]) + ", " + str(x[1])))
        for line in specie_count.collect():
            print(line)
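The reducer in MODE "1" calls maxByIndex, which is not shown in the snippet; a plausible definition, assuming it keeps whichever value tuple has the larger numeric field at the given index (hypothetical, for illustration only):

def maxByIndex(x, y, idx):
    # Hypothetical helper: return the tuple whose element at position idx
    # compares larger when treated as a number.
    return x if float(x[idx]) >= float(y[idx]) else y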
Example #11
# -*- encoding: utf-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark import StorageLevel
import random

conf = SparkConf()
conf.setExecutorEnv(
    'PYSPARK_PYTHON',
    '/home/classify/workspace/ENV_material_recommendation/bin/python')
conf.setExecutorEnv(
    'PYTHONPATH',
    '/home/classify/workspace/ENV_material_recommendation/lib/python2.7/site-packages:/home/classify/workspace/material_recommendation'
)

# conf.setExecutorEnv('PYSPARK_PYTHON', '/home/classify/workspace/ENV_qa-helper/bin/python')
# conf.setExecutorEnv('PYTHONPATH',
#                     '/home/classify/workspace/ENV_qa-helper/lib/python2.7/site-packages:/home/classify/workspace/qa-helper')

conf.set("spark.cores.max", "40")
conf.set("spark.scheduler.mode", "FAIR")

sc = SparkContext(conf=conf)

sc.setLogLevel('ERROR')


def custom_zip(rdd1, rdd2, npart=None):
    """
    see http://stackoverflow.com/questions/32084368/can-only-zip-with-rdd-which-has-the-same-number-of-partitions-error
    """
    def prepare(rdd, npart):
Example #12
import itertools
import time
import subprocess
import sys
from commands import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col
import optparse
import re
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText

appName = "NetflowReplication:QA"
conf = SparkConf().setAppName(appName)

conf.setExecutorEnv(
    'PYTHONPATH',
    '/opt/spark/python:/opt/spark/python/lib/py4j-0.8.2.1-src.zip')

conf.set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

if len(sys.argv) < 4:
    print "Usage: /opt/spark/bin/spark-submit " + sys.argv[
        0] + " <netflow input path> <file with list of IP addresses to filter> <output filtered netflow text directory>"
    sys.exit()
path = sys.argv[1]
input_ip = sys.argv[2]
output = sys.argv[3]

list = []
Example #13
File: hpa.py Project: eagle9/palgo
    'INTERNAL_PROC_ERAB_SETUP': [12, 13, 19, 20],
    'INTERNAL_PROC_INITIAL_CTXT_SETUP' : [12, 13, 20, 21],
    'INTERNAL_PROC_UE_CTXT_RELEASE': [17, 21, 22, 23],
    'INTERNAL_PROC_HO_PREP_S1_IN': [17, 18, 19],
    'INTERNAL_PROC_HO_PREP_X2_IN' : [18, 19, 20],
    'INTERNAL_PROC_RRC_CONN_SETUP': [12, 13],
    'INTERNAL_PROC_S1_SIG_CONN_SETUP' : [13]}

#os.environ['PYSPARK_PYTHON'] = '/usr/bin/python'
#py2.7 timedelta.total_seconds()

#NUM_PARTITIONS = 2000
from pyspark import SparkConf, SparkContext

path = '/user/mfoo/20160318tmp/seqFile.seq'
conf = SparkConf()
conf.set('spark.yarn.dist.files','file:/home/wfoo/install/spark1.4/python/lib/pyspark.zip,file:/home/wfoo/install/spark1.4/python/lib/py4j-0.8.2.1-src.zip')
conf.setExecutorEnv('PYTHONPATH','pyspark.zip:py4j-0.8.2.1-src.zip')
#conf.set("dynamicAllocation.enabled", "true")
conf.set("spark.yarn.executor.memoryOverhead", 8192)
conf.set("spark.yarn.driver.memoryOverhead", 8192)
#conf.set("spark.executor.memory", "6g")
#conf.set("spark.driver.memory", "6g")
conf.set("spark.rdd.compress", "true")
conf.set("spark.storage.memoryFraction", 1)
conf.set("spark.core.connection.ack.wait.timeout", 600)
conf.set("spark.akka.frameSize", 50)
#conf.set("spark.local.dir","/data1/hadoop")
#conf.set("spark.driver.maxResultSize","32g")
#conf.setMaster("yarn-client")
sc = SparkContext(appName = "hpa_stats", conf=conf)
evt = sc.broadcast(EVENT_NAME)
fld = sc.broadcast(eventFields[EVENT_NAME])
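The snippet defines path but ends before reading it; presumably the sequence file is then loaded with something along these lines (a sketch, not part of the original code):

# Hypothetical continuation: read the Hadoop sequence file configured above.
records = sc.sequenceFile(path)
print(records.take(1))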
Example #14
# sc = SparkContext("local[2]", "data analyse")

# conf = SparkConf().setAppName("data analyse").setMaster('spark://10.0.2.15:7077')
# sc = SparkContext(conf=conf)

# config for SparkContext
SPARK_HOME = os.environ['SPARK_HOME']
conf = SparkConf().setMaster('spark://10.0.2.15:7077').set(
    "spark.executor.memory", "2g").set("spark.cores.max", "65")
site_packages = os.path.abspath((os.path.join(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir),
    os.pardir))) + '/site-packages'
pyspark = SPARK_HOME + "/python" + ":" + SPARK_HOME + "/python/lib/py4j-0.9-src.zip" + ":" + SPARK_HOME + "/python/lib/pyspark.zip"
conf.setExecutorEnv(
    "PYTHONPATH",
    "$PYTHONPATH:" + site_packages + ":" + pyspark + ":" + os.path.abspath(
        os.path.join(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir), os.pardir)))

ROOTDIR = '/home/gugugujiawei/projects/data'


@main.route('/')
def index():
    return render_template('index.html')


@main.route('/cls/logistic')
def logistic():
    return render_template('reactjs/index.html')
Example #15
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Number of elements in RDD is 8
Computation succeeded!
"""



from pyspark import SparkConf
from pyspark import SparkContext

conf = SparkConf()
conf.setMaster('yarn')
conf.setAppName('spark-yarn')
conf.setExecutorEnv('HADOOP_CONF_DIR','$HADOOP_HOME/etc/hadoop')
conf.setExecutorEnv('YARN_CONF_DIR','$HADOOP_HOME/etc/hadoop')
sc = SparkContext(conf=conf)


def mod(x):
    import numpy as np
    return (x, np.mod(x, 2))


rdd = sc.parallelize(range(1000)).map(mod).take(10)
print(rdd)

"""
>>>
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/gupengxiang/.local/lib/python3.6/site-packages/pyspark/jars/slf4j-log4j12-1.7.16.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/opt/cloudera/parcels/CDH-5.11.0-1.cdh5.11.0.p0.34/jars/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]