def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42)

    # Train the models
    decisionTreeModel = DecisionTree.trainClassifier(trainSet, numClasses=5, categoricalFeaturesInfo={},
                                         impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2)

    # Test the model
    testDecisionTree(decisionTreeModel, parallelized_test_set)
def save_data_to_db():
    
    from pyspark import SparkContext, SparkConf
    from pyspark.streaming import StreamingContext

    conf = SparkConf().setMaster("localhost")
    sc = SparkContext("local[*]", "camera_mechine_gen")
    sc.setLogLevel("WARN")

    data_used_by_camera_mechine_gen.drop()
    path = '/3/2014-10-15'


    for station in stations:
        station_info = station_destinations_by_directions.find_one({"station_name":station})
        if station_info is None:
            continue
        
        destinations_by_directions = station_info['destinations_by_directions']
    
    
        full_path = data_dir_path+'v0/'+station+path
        print(full_path)
        func = map_anlalyser_gen(station, destinations_by_directions)
        
        
        file_data = sc.textFile(full_path).map(pre_process_1).groupByKey().map(func).collect()
        
        for i in sorted(file_data, key=lambda x:x[0]):
            time = i[0]
            C1_by_directions = list(i[1].items())
            
            #print station, time, C1_by_directions
            data_used_by_camera_mechine_gen.insert({'station_name':station, 'time':time, 'C1_by_directions':C1_by_directions})
def main():
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Truncate the last 2 features of each data point
    # (np.delete returns a new array, so reassign instead of rebinding the loop variable)
    train_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in train_data]
    test_data = [np.delete(dp, [np.size(dp) - 2, np.size(dp) - 1]) for dp in test_data]

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

    # Train the models
    randomForestModel = RandomForest.trainClassifier(trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={},
                                         numTrees=750, seed=42, maxDepth=30, maxBins=32)

    # Test the model
    testRandomForest(randomForestModel, parallelized_test_set)
def create_spark_application(app_name):
  """Creates and returns a Spark & SQL Context."""

  conf = (SparkConf().setAppName(app_name))
  spark_context = SparkContext(conf=conf)
  spark_context.setLogLevel('WARN')
  sql_context = SQLContext(spark_context)

  return (spark_context, sql_context)
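
# A brief usage sketch (illustrative, not from the original example; assumes SparkConf,
# SparkContext and SQLContext are imported from pyspark / pyspark.sql):
# spark_context, sql_context = create_spark_application('example-app')
# df = sql_context.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'value'])
# df.show()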
Example 5
def spark_context(request):
    """
    Pytest fixture for creating a spark context.
    Args:
        :param request: pytest.FixtureRequest object
    """
    conf = (SparkConf().setMaster("local").setAppName("pyspark-local-testing"))
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    request.addfinalizer(lambda: sc.stop())
    return sc
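
# A minimal usage sketch, assuming the function above is registered with the
# @pytest.fixture decorator (not shown in this excerpt): pytest injects the fixture
# into any test that declares an argument with the same name.
def test_spark_context_fixture(spark_context):
    rdd = spark_context.parallelize([1, 2, 3, 4])
    assert rdd.count() == 4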
Example 6
def functionToCreateContext():
    sc = SparkContext("local[*]", "streaming_part")
    sc.setLogLevel("ERROR")
    
    ssc = StreamingContext(sc, 5)
    
    data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999)
    data_from_camera_mechine = ssc.socketTextStream("localhost", 9998)
    
    
    #meat
    data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler)
    
    ssc.checkpoint(checkpointDirectory)   # set checkpoint directory
    return ssc
def spark_context(request):
    # If RIAK_HOSTS is not set, use Docker to start a Riak node
    if 'RIAK_HOSTS' not in os.environ:
        docker_cli = request.getfixturevalue('docker_cli')
        host_and_port = get_host_and_port(docker_cli)
        os.environ['RIAK_HOSTS'] = host_and_port
        os.environ['USE_DOCKER'] = 'true'
    # Start new spark context
    conf = SparkConf().setMaster('local[*]').setAppName('pytest-pyspark-local-testing')
    conf.set('spark.riak.connection.host', os.environ['RIAK_HOSTS'])
    conf.set('spark.driver.memory', '4g')
    conf.set('spark.executor.memory', '4g')
    spark_context = SparkContext(conf=conf)
    spark_context.setLogLevel('INFO')
    pyspark_riak.riak_context(spark_context)
    request.addfinalizer(lambda: spark_context.stop())
    return spark_context
Example 8
def save_data_to_db():
    from pyspark import SparkContext, SparkConf
    from pyspark.streaming import StreamingContext

    conf = SparkConf().setMaster("localhost")
    sc = SparkContext("local[*]", "tikcket_mechine_gen")
    sc.setLogLevel("WARN")
    sc.addFile(lib_dir+'/getDistance.py')

    data_used_by_ticket_mechine_gen.drop()
    path = '/3/2014-10-15'
    for s in stations:
        full_path = data_dir_path+'v0/'+s+path
        print(full_path)
        data_to_save = getDistance.get_one_day_group_by_time(full_path, sc)
        for item in data_to_save:
            data_used_by_ticket_mechine_gen.insert({'station_name':s, 'time':item[0], 'data':item[1]})
Example 9
def test():
    sc = SparkContext(master='local[4]', appName='lda')
    sc.setLogLevel('ERROR')

    def train():
        data = sc.textFile(corpus_filename).map(lambda line: Vectors.dense([float(i) for i in line.strip().split()]))
        corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
        # print(corpus.take(5))

        lda_model = LDA.train(rdd=corpus, maxIterations=max_iter, seed=seed, checkpointInterval=checkin_point_interval,
                              k=K,
                              optimizer=optimizer, docConcentration=alpha, topicConcentration=beta)
        if os.path.exists('./ldamodel'): __import__('shutil').rmtree('./ldamodel')
        lda_model.save(sc, "./ldamodel")

    # train()

    lda_model = LDAModel.load(sc, "./ldamodel")

    # topic-word distribution (unnormalized; each column corresponds to one topic)
    topics = lda_model.topicsMatrix()
    # for tid in range(3):
    #     print('Topic' + str(tid) + ':')
    #     for wid in range(0, lda_model.vocabSize()):
    #         print(' ' + str(topics[wid, tid] / sum(topics[:, tid])))  # normalized
    #         # print(' ' + str(topics[wid, tid]))

    # per-topic word distribution from describeTopics(): ([word ids sorted by weight, descending], [weights of those words in the topic])
    topics_dist = lda_model.describeTopics()
    for tid, topic in enumerate(topics_dist):
        print('Topic' + str(tid) + ':' + '\n', topic)

    # document-topic distribution (not available in mllib's LDAModel; use the ml API for that)
    # doc_topic = lda_model

    sc.stop()
Example 10
from pyspark import SparkConf, SparkContext
import re

conf = SparkConf().setMaster("spark://192.168.56.100:7077").setAppName("My App")
sc = SparkContext(conf = conf)

sc.setLogLevel('WARN')

p = re.compile(r'^\d+\.\d+\.\d+\.\d+.*$')

input_file = sc.textFile('/etc/hosts')
hosts = input_file.filter(lambda x: p.match(x))

ips = hosts.map(lambda x: x.split('\t')[0])

print "\nnumber of ips: %d" % ips.count()
print '-'*5

for ip in ips.collect():
        print ip
Example 11
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if  not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    # read given file into RDD
    rdd = sc.textFile(opts.fname).map(lambda line: line.split(","))
    # create a dataframe out of RDD
    pdf = rdd.toDF(headers())
    if  opts.verbose:
        pdf.show()
        print("pdf data type", type(pdf))
        pdf.printSchema()

    # cast columns to correct data types
    cast_types = [
        ("block_bytes", DoubleType()), ("block_files", IntegerType()),
        ("br_src_bytes", DoubleType()), ("br_src_files", IntegerType()),
        ("br_dest_bytes", DoubleType()), ("br_dest_files", IntegerType()),
        ("br_node_bytes", DoubleType()), ("br_node_files", IntegerType()),
        ("br_xfer_bytes", DoubleType()), ("br_xfer_files", IntegerType()),
    ]
    ndf = pdf
    for cname, ctype in cast_types:
        ndf = ndf.withColumn(cname + "_tmp", ndf[cname].cast(ctype))\
                 .drop(cname).withColumnRenamed(cname + "_tmp", cname)
    # example of aggregation
#    res = ndf.filter("dataset_is_open='y'").groupBy().sum('block_bytes')
#    print("open dataset size", res.collect())

    if  opts.order == 'dataset':
        res = ndf.map(lambda r: ((r.dataset_name, r.node_name), r)).groupByKey().map(lambda g: (g[0], stats(g[1])))
    elif opts.order == 'site' or opts.order == 'node':
        res = ndf.map(lambda r: ((r.node_name, r.dataset_name), r)).groupByKey().map(lambda g: (g[0], stats(g[1])))
    else:
        msg = 'The order key="%s" is not supported' % opts.order
        raise NotImplementedError(msg)

    if  opts.fout:
#        lines = res.map(toCSV)
        lines = res.map(lambda g: (g[0][0],(g[0][1], g[1]))).groupByKey().map(toCSV2)
        lines.saveAsTextFile(opts.fout)
    else:
        count = 0
        print("dataset site nfiles bsize status cust group")
        for item in res.collect():
            pair = item[0]
            dataset = pair[0]
            site = pair[1]
            items = item[1]
            nfiles = items[0]
            bsize = items[1]
            dstatus = items[2]
            cust = items[3]
            group = items[4]
            print('%s %s %s %s %s %s %s' % (dataset, site, nfiles, bsize, dstatus, cust, group))
            count += 1
            if count>10:
                break
        denominator += abs(w)
        numerator += (user_movie_rating_dict[(user_id_1, movie_id_2)] * w)

    if denominator == 0:
        return movie_rating_dict.get(movie_id_1, 3.5)

    rating = numerator / denominator
    if rating < 0.0:
        return 0.0
    elif rating > 5.0:
        return 5.0
    return rating


spark_context = SparkContext(appName='ItemBasedCF', conf=SparkConf())
spark_context.setLogLevel("WARN")

file_path = sys.argv[1]
test_file = sys.argv[2]
similarity_file = sys.argv[3]

test_file = spark_context.textFile(test_file)
# test_file = test_file.coalesce(4)
test_file_header = test_file.first()
testing_data = test_file \
    .filter(lambda line: line != test_file_header) \
    .map(lambda line: user_movie_map(line))\
    .persist()

data = spark_context.textFile(file_path)
header = data.first()
Example 13
from pyspark.sql.functions import col
from pyspark.ml import PipelineModel
from pyspark.sql import SQLContext, SparkSession
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from collections import namedtuple
from functools import reduce
# from pyspark.sql.functions import desc

sc = SparkContext("local[2]", "Streaming App")
pipelineFit = PipelineModel.load("standford_500.logreg.model")

sc.setLogLevel("error")
ssc = StreamingContext(sc, 10)
sqlContext = SQLContext(sc)
#ssc.checkpoint( "file:/home/ubuntu/tweets/checkpoint/")

socket_stream = ssc.socketTextStream(
    "127.0.0.1", 5555)  # Internal ip of  the tweepy streamer

lines = socket_stream.window(20)
#lines.pprint()
fields = ("tweet_text")
Tweet = namedtuple('Tweet', fields)


def getTotalCount():
    if ("totalTweetsCount" not in globals()):
        globals()["totalTweetsCount"] = 0
    return globals()["totalTweetsCount"]
# Initialize parser and parse arguments
parser = argparse.ArgumentParser()
parser.add_argument("-input","--input",help="Complete input file path for Dataset ex. hdfs:/CCF/input/example.csv")
parser.add_argument("-output","--output",help="Complete output path for results ex. hdfs:/CCF/output")
parser.add_argument("-partition","--partition",type=int,help="Number of partitions for dataset")
args = parser.parse_args()
partition_number = args.partition
input_file_path = args.input
output_directory = args.output

# Initialize spark-context configuration
conf = SparkConf()
conf.setAppName('pyspark-shell-CCF-v1')

sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")

# Initialize logger
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)

LOGGER.warn("################################")
LOGGER.warn(" Start CCF RDD version 1")
LOGGER.warn("--------------------------------")

# Import as RDD line_by_line
raw_graph = sc.textFile(input_file_path,minPartitions=partition_number)

# CSV transformation -> the separator needs to be adapted to the file format
r = raw_graph.map(lambda x:x.split('\t')).map(lambda x:(x[0],x[1]))
Example 15
    def __init__(self, arglist, _sc = None, _sqlContext = None):
        sc = SparkContext() if _sc is None else _sc
        sqlContext = HiveContext(sc) if _sqlContext is None else _sqlContext

        sc.setLogLevel("ERROR")

        self.sqlContext = sqlContext
        self.sc = sc
        self._jvm = sc._jvm

        from py4j.java_gateway import java_import
        java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
        java_import(self._jvm, "org.tresamigos.smv.dqm.*")
        java_import(self._jvm, "org.tresamigos.smv.panel.*")
        java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
        java_import(self._jvm, "org.tresamigos.smv.SmvRunInfoCollector")

        self.j_smvPyClient = self.create_smv_pyclient(arglist)

        # shortcut is meant for internal use only
        self.j_smvApp = self.j_smvPyClient.j_smvApp()
        self.log = self.j_smvApp.log()

        # AFTER app is available but BEFORE stages,
        # use the dynamically configured app dir to set the source path
        self.prepend_source(self.SRC_PROJECT_PATH)

        # issue #429 set application name from smv config
        sc._conf.setAppName(self.appName())

        # user may choose a port for the callback server
        gw = sc._gateway
        cbsp = self.j_smvPyClient.callbackServerPort()
        cbs_port = cbsp.get() if cbsp.isDefined() else gw._python_proxy_port

        # check whether the port is in use. Try 10 times; if all fail, error out
        check_counter = 0
        while(not check_socket(cbs_port) and check_counter < 10):
            cbs_port += 1
            check_counter += 1

        if (not check_socket(cbs_port)):
            raise SmvRuntimeError("Start Python callback server failed. Port {0}-{1} are all in use".format(cbs_port - check_counter, cbs_port))

        # this was a workaround for py4j 0.8.2.1, shipped with spark
        # 1.5.x, to prevent the callback server from hanging the
        # python, and hence the java, process
        from pyspark.streaming.context import _daemonize_callback_server
        _daemonize_callback_server()

        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            print("SMV starting Py4j callback server on port {0}".format(cbs_port))
            gw._shutdown_callback_server() # in case another has already started
            gw._start_callback_server(cbs_port)
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.SmvPythonHelper.updatePythonGatewayPort(jgws, gw._python_proxy_port)

        self.repoFactory = DataSetRepoFactory(self)
        self.j_smvPyClient.registerRepoFactory('Python', self.repoFactory)

        # Initialize DataFrame and Column with helper methods
        smv.helpers.init_helpers()
Example 16
def main(argv):
    assert len(
        argv
    ) == 3, "Script takes 3 arguments <train_file><model_file><cf_type>"

    # Unpack arguments
    train_file, model_file, cf_type = argv

    config = SparkConf().setMaster("local[*]") \
                        .setAppName("Task3train") \
                        .set("spark.executor.memory", "4g") \
                        .set("spark.driver.memory", "4g")

    sc = SparkContext(conf=config).getOrCreate()

    sc.setLogLevel("ERROR")

    if cf_type == "item_based":

        lines = sc.textFile(train_file).map(json.loads).cache()

        business_tokens = lines.map(lambda x: x["business_id"]).distinct(
        ).zipWithIndex().collectAsMap()
        tokens_business = {v: k for k, v in business_tokens.items()}

        rdd = lines.map(lambda  x: (business_tokens[x["business_id"]], (x["user_id"], x["stars"]))) \
                   .groupByKey().filter(lambda x: len(x[1]) >= 3) \
                   .mapValues(dict) \
                   .cache()

        tokens_rdd = rdd.map(lambda x: x[0])

        rdd_dict = rdd.collectAsMap()

        results = tokens_rdd.cartesian(tokens_rdd) \
                               .filter(lambda x: x[0] < x[1]) \
                               .filter(lambda x: get_intersect(rdd_dict[x[0]], rdd_dict[x[1]]) >= 3) \
                               .map(lambda x: ((x[0], x[1]), pearson_correlation(rdd_dict[x[0]], rdd_dict[x[1]]))) \
                               .filter(lambda x: x[1] > 0.0).collect()

        print("Number of candidates --------------> ", len(results))
        with open(model_file, "w+") as file:
            for line in results:
                file.writelines(
                    json.dumps({
                        "b1": tokens_business[line[0][0]],
                        "b2": tokens_business[line[0][1]],
                        "sim": line[1]
                    }) + "\n")
            file.close()
    else:
        lines = sc.textFile(train_file).map(json.loads).cache()

        business_tokens = lines.map(lambda x: x["business_id"]).distinct(
        ).zipWithIndex().collectAsMap()
        tokens_business = {v: k for k, v in business_tokens.items()}

        user_tokens = lines.map(
            lambda x: x["user_id"]).distinct().zipWithIndex().collectAsMap()
        tokens_user = {v: k for k, v in user_tokens.items()}

        business_users = lines.map(lambda x: (business_tokens[x["business_id"]], user_tokens[x["user_id"]])) \
                              .groupByKey() \
                              .filter(lambda x: len(x[1]) >= 3) \
                              .mapValues(list).cache()

        users_business = lines.map(lambda x: (user_tokens[x["user_id"]], (business_tokens[x["business_id"]], x["stars"]))) \
                              .groupByKey() \
                              .filter(lambda x: len(x[1]) >= 3) \
                              .mapValues(dict) \
                              .collectAsMap()

        # MinHash
        hash_funcs = [min_hash_func(i) for i in range(NUM_BUCKETS)]

        hash_rdd = business_users.map(lambda x:
                                      (x[0], get_hash(x[0], hash_funcs)))

        joined_hash_rdd = hash_rdd.join(business_users).partitionBy(
            7, lambda x: hash(x) % 7)

        signature_mat = joined_hash_rdd.map(lambda x: get_user_hash(x[1])) \
                                       .flatMap(lambda x: x) \
                                       .reduceByKey(lambda h1, h2: min_hash(h1, h2))

        lsh_hash_funcs = [lsh_hash(i) for i in range(BANDS)]

        candidates = signature_mat.map(lambda x: (x[0], generate_bands(x[1]))) \
                             .map(group_bands) \
                             .flatMap(lambda x: x) \
                             .groupByKey() \
                             .map(lambda x: lsh(x, lsh_hash_funcs)) \
                             .flatMap(lambda x: x[1]) \
                             .filter(lambda x: len(x) > 1) \
                             .flatMap(lambda pairs: [pair for pair in combinations(pairs, 2)]) \
                             .distinct() \
                             .filter(lambda x: users_business.get(x[0]) is not None and users_business.get(x[1]) is not None) \
                             .filter(lambda x: get_intersect(users_business[x[0]], users_business[x[1]]) >= 3) \
                             .filter(lambda x: jaccard(users_business[x[0]], users_business[x[1]]) >= 0.01) \
                             .map(lambda x: ((x[0], x[1]), pearson_correlation(users_business[x[0]], users_business[x[1]]))) \
                             .filter(lambda x: x[1] > 0.0).collect()
        print("Number of candidates -----------------> ", len(candidates))
        with open(model_file, "w+") as file:
            for line in candidates:
                file.writelines(
                    json.dumps({
                        "u1": tokens_user[line[0][0]],
                        "u2": tokens_user[line[0][1]],
                        "sim": line[1]
                    }) + "\n")
            file.close()
    parser.add_argument('--N',
                        type=int,
                        default=20,
                        help='Number of partitions')
    parser.add_argument(
        '--solver',
        default='GD',
        choices=['GD', 'LS'],
        help=
        'GD learns β via gradient descent, LS learns β by solving a linear system of equations'
    )

    args = parser.parse_args()

    sc = SparkContext(appName='Parallel Ridge Regression')
    sc.setLogLevel('warn')

    beta = None

    if args.traindata is not None:
        # Train a linear model β from data with regularization parameter λ, and store it in beta
        print('Reading training data from', args.traindata)
        data = readData(args.traindata, sc)
        data = data.repartition(args.N).cache()

        x, y = data.take(1)[0]
        beta0 = np.zeros(len(x))

        if args.solver == 'GD':
            start = time()
            print('Training on data from', args.traindata, 'with λ =',
Example 18
def main():
    global spark

    conf = (SparkConf()
            .setAppName("Enc SNI classification")
            .set("spark.dynamicAllocation.enabled", "false")
            .set("spark.task.maxFailures", 128)
            .set("spark.yarn.max.executor.failures", 128)
            .set("spark.executor.cores", "8")
            .set("spark.executor.memory", "12G")
            .set("spark.executor.instances", "80")
            .set("spark.network.timeout", "300")
            .set("spark.executorEnv.PYTHON_EGG_CACHE", "./.python-eggs-cache/")
            .set("spark.executorEnv.PYTHON_EGG_DIR", "./.python-eggs/")
            .set("spark.driverEnv.PYTHON_EGG_CACHE", "./.python-eggs-cache/")
            .set("spark.driverEnv.PYTHON_EGG_DIR", "./.python-eggs/")
            .set("spark.driver.maxResultSize", "1024G")
            .set("spark.kryoserializer.buffer.max", "10240G")
            .set("spark.kryoserializer.buffer.max.mb", "2047"))

    if not DEBUGGER:
        sc = SparkContext(conf=conf)
        sc.setLogLevel("ERROR")
        spark = SparkSession(sc)

    # Load ASNs
    print_box("Computing ASNs")
    asns = {}
    print("Target ASNs:")
    for as_name, as_nb in [
            l.split(",") for l in open(ASN_FILE, "r").read().splitlines()
    ]:
        this_asns = as_nb.split(':')
        asns[as_name] = this_asns
        print("   {}: {}".format(as_name, as_nb))

    # Compute entries and ASs
    training = spark.sparkContext.textFile(LOG_TCP_TRAIN_IN)\
                    .mapPartitions(lambda p: get_tcp_entries(p, TRAIN_HASHING, asns) ).filter(lambda e: e["c_ip"] not in CLIENT_BLACKLIST)
    testing  = spark.sparkContext.textFile(LOG_TCP_TEST_IN )\
                    .mapPartitions(lambda p: get_tcp_entries(p, TEST_HASHING, asns) ).filter(lambda e: e["c_ip"] not in CLIENT_BLACKLIST)

    # Persist and print size
    training.persist(StorageLevel(True, True, False, False, 1))
    testing.persist(StorageLevel(True, True, False, False, 1))
    print("Training log entries:", training.count())
    print("Testing log entries:", testing.count())

    # Start classification
    print_box("Working on classification")
    models = {}
    reports = {}
    for as_name in asns:
        print("Working on : {}".format(as_name))

        # Filter, persist and print size
        this_training = training.filter(lambda e: e["s_asn_name"] == as_name)
        this_testing = testing.filter(lambda e: e["s_asn_name"] == as_name)
        this_training.cache()
        this_testing.cache()

        training_count = this_training.count()
        testing_count = this_testing.count()
        print("    Training set:", training_count)
        print("    Testing set:", testing_count)

        # Proceed only if having data points
        if training_count > 0 and testing_count > 0:

            # Compute target domains
            if BINARY is None:
                print("    Computing Occurrences")
                occurrences = dict(
                    this_training.map(lambda w:
                                      (w["domain"], None)).countByKey())
                target_domains = [
                    k for k, v in sorted(occurrences.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True) if v >= MIN_OCCURENCES
                ]
                if MAX_DOMAINS is not None:
                    target_domains = target_domains[:MAX_DOMAINS]
                target_domains_dict = {
                    d: i
                    for i, d in enumerate(target_domains)
                }
                for d in occurrences:
                    if d in target_domains:
                        print("        ", d, ":", occurrences[d])
            else:
                print("    Using binary classification with target:", BINARY)
                target_domains = [BINARY]
                target_domains_dict = {BINARY: 0}

            if len(target_domains) > 0:
                # Extract features
                print("    Extracting Features")

                training_features = this_training\
                                    .map(lambda w: extract_feature(w, target_domains, target_domains_dict)).toDF()
                testing_features  = this_testing\
                                    .map(lambda w: extract_feature(w, target_domains,target_domains_dict)).toDF()

                # Classify
                if CLASSIFY_SPARK:
                    print("    Classifying")
                    model, training_report, testing_report = classify_spark(
                        training_features, testing_features, target_domains,
                        target_domains_dict)

                else:
                    training_local = training_features.toPandas()
                    testing_local = testing_features.toPandas()

                    if DATASET_DIR is not None:
                        if not os.path.exists(DATASET_DIR):
                            os.makedirs(DATASET_DIR)
                        training_local.to_csv("{}/{}.training.csv".format(
                            DATASET_DIR, as_name),
                                              index=False)
                        testing_local.to_csv("{}/{}.testing.csv".format(
                            DATASET_DIR, as_name),
                                             index=False)

                    print("    Classifying")
                    model, training_report, testing_report = classify_local(
                        training_local, testing_local)

                # Store reports
                report = {
                    "training": training_report,
                    "testing": testing_report
                }
                print("        Macro avg F1:",
                      testing_report["macro avg"]["f1-score"])
                print("        Weighted avg F1:",
                      testing_report["weighted avg"]["f1-score"])
                reports[as_name] = report
                if MODEL_OUT is not None:
                    models[as_name] = model
            else:
                print("    Skipping as no domain has minimum occurrences")
                reports[as_name] = {
                    "error": "no domain has minimum occurrences"
                }
                models[as_name] = {
                    "error": "no domain has minimum occurrences"
                }
        else:
            print("    Skipping as empty")
            reports[as_name] = {"error": "empty dataset"}
            if MODEL_OUT is not None:
                models[as_name] = {"error": "empty dataset"}
        gc.collect()

    # Save results on disk
    if REPORT_OUT is not None:
        json.dump(reports, open(REPORT_OUT, "w"), indent=4)
    if MODEL_OUT is not None and not CLASSIFY_SPARK:
        pickle.dump(models, open(MODEL_OUT, "wb"))

    if DEBUGGER:
        pdb.set_trace()
###### This is a script to use Spark Streaming to consume the sessions data stream from Kafka.
#####################################################

from pyspark import SparkContext
from pyspark.streaming import StreamingContext, StreamingListener
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
import yaml
import redis
import psycopg2, datetime

with open("config.yml", 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)

# Create Spark Streaming Context
sc = SparkContext(appName="consuming_sessions")
sc.setLogLevel("Error")
ssc = StreamingContext(sc, 2)

# Connect to Kafka and split each message to list of strings
topic = "sessions"

sessionStream = KafkaUtils.createDirectStream(
    ssc, [topic], {"metadata.broker.list": config['broker_list']})
# add {'auto.offset.reset':'smallest'} to read from beginning

lines = sessionStream.map(lambda x: x[1])
lines_list = lines.map(lambda line: line[:-1].split("\t"))

########### Filtering the sessions stream and calculating the metrics ###############
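# A hedged sketch of the kind of filtering/metric this section could compute (the actual
# code is not part of this excerpt; the field positions below are hypothetical):
# valid_events = lines_list.filter(lambda fields: len(fields) > 1)
# events_per_session = valid_events.map(lambda fields: (fields[0], 1)).reduceByKey(lambda a, b: a + b)
# events_per_session.pprint()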

Example 20
from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql.functions as f

sc = SparkContext('local', 'app')
sc.setLogLevel('OFF')
spark = SparkSession(sc)

spark.readStream \
    .format('socket') \
    .option('host', 'localhost') \
    .option('port', 9999) \
    .load() \
    .agg(f.collect_list(f.col('value')).alias('list')) \
    .select(f.explode(f.col('list'))) \
    .writeStream \
    .format('console') \
    .outputMode('complete') \
    .option('checkpointLocation', 'data/checkpoint') \
    .start() \
    .awaitTermination()

# $ ncat -lk 9999
#
# first
# second
# third

# -------------------------------------------
# Batch: 0
# -------------------------------------------
Example 21
		print("\n Example")
		print("spark-submit --packages graphframes:graphframes:0.5.0-spark1.6-s_2.10 "
			  "3-calculate-network-metrics.py "
			  "/home/madis/IR/thesis/parquets/bipartite-sku-only-sku-matches "
			  "network-metrics")
		sys.exit(1)
	else:
		bipartite_location = sys.argv[1]
		output_location = sys.argv[2]

	s_conf = SparkConf()
	s_conf.set("spark.executor.instances", "4")
	s_conf.set("spark.executor.memory", "2g")
	s_conf.set("spark.driver.memory", "2g")
	spark = SparkContext(conf=s_conf)
	spark.setLogLevel("ERROR")

	# needed here if running locally; in a cluster it can go with the other imports
	import graphframes

	sqlContext = SQLContext(spark)
	# needed for connected components
	# the checkpoint dir must be an existing directory!
	spark.setCheckpointDir("tmp/checkpoint")

	graph = build_graph(bipartite_location)

	dfs = find_subgraphs(graph)

	vertices_out = network_algorithms(graph, dfs)
Example 22
__author__ = "ResearchInMotion"

import findspark
findspark.init()
from pyspark import SparkConf
from pyspark import SparkContext

sparkconf = SparkConf().setAppName("WordCount").setMaster("local[*]")
sparkcont = SparkContext(conf=sparkconf)
sparkcont.setLogLevel("ERROR")


def columns(lines):
    field = lines.split(",")
    country = field[3]
    name = field[2]
    return name, country


data = sparkcont.textFile(
    "/Users/sahilnagpal/PycharmProjects/Python/Pyspark/InputData/airports.text"
)
columnsdata = data.map(columns).filter(
    lambda country: country[1] == "\"United States\"").take(5)

for name, country in columnsdata:
    print("{}  , {} ".format(name, country))
Example 23
def main():
    conf = SparkConf().setMaster("local[*]").setAppName("compare_engine")

    sc = SparkContext(conf=conf)
    sc.setLogLevel("INFO")

    sc.addFile(primary)

    # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct()
    rdd_primary = sc.textFile(SparkFiles.get(primary),
                              minPartitions=4,
                              use_unicode=True).distinct()
    rdd_primary.partitionBy(10).cache()

    os.system("rm -Rf collects_*")
    os.system("rm -Rf holder.txt")

    rdd_secondary = sc.textFile(secondary, minPartitions=4,
                                use_unicode=True).distinct()
    rdd_secondary.partitionBy(10).cache()

    primary_count = rdd_primary.count()
    primary_report["count"] = primary_count
    print(primary_report)

    secondary_count = rdd_secondary.count()
    secondary_report["count"] = secondary_count
    print(secondary_report)

    # Return each Primary file line/record not contained in Secondary
    not_in_primary = rdd_primary.subtract(rdd_secondary)
    primary_diff = not_in_primary.count()
    primary_report["diff"] = primary_diff

    os.system("rm -Rf collects_*.csv")

    primary_dir = "collects_{}_primary".format(run_date)
    primary_report_name = "collects_{}_primary_report.csv".format(run_date)

    not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir)

    # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date))
    os.system("cat {}/part-0000* >> {}".format(primary_dir,
                                               primary_report_name))
    os.system("wc -l collects_{}_primary_report.csv".format(run_date))

    # Flip Primary Vs Secondary
    # Return each Secondary file line/record not contained in Primary
    not_in_secondary = rdd_secondary.subtract(rdd_primary)
    secondary_diff = not_in_secondary.count()
    secondary_report["diff"] = secondary_diff

    not_in_secondary.coalesce(1, True).saveAsTextFile(
        "collects_{}_secondary".format(run_date))
    os.system(
        "cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv"
        .format(run_date, run_date))
    os.system("wc -l collects_{}_secondary_report.csv".format(run_date))

    process_report["primary"] = primary_report
    process_report["secondary"] = secondary_report

    print("=" * 100)
    print("\n")
    print(process_report)
    print("\n")
    print("=" * 100)
    spark_details(sc)

    sc.stop()
    return predictions


def append_key_to_dictionary(dictionary, key, value):
    dictionary[key] = value
    return dictionary

def insert_into_table(values, table_name, host, port):
    pass



if __name__ == '__main__':

    sc = SparkContext(appName='PythonSparkStreamingKafka')
    sc.setLogLevel("WARN")  # avoid printing logs

    # setting up a model
    lr = StreamingLogisticRegressionWithSGD()
    parameters = json.load(open('model.json', 'r'))
    # lr.setInitialWeights(parameters['weights'])
    lr = create_logistic_regression_skl(parameters['weights'], parameters['intercept'])
    stop_words = load_stopwords()
    common_words = load_common_words()
    reference_table = create_hash_table(common_words=common_words, stop_words=stop_words)

    ssc = StreamingContext(sparkContext=sc, batchDuration=2)
    spark_sql = SQLContext(sparkContext=sc)

    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc,
                                                topics=['trump'],
Example 25
class Reader():
    def __init__(self):
        self.sc = SparkContext('local', 'Stream-SQL')
        self.ssc = StreamingContext(self.sc, batchDuration=3)
        self.spark = SparkSession.builder\
            .getOrCreate()
        self.sc.setLogLevel('ERROR')

    def initStream(self):
        self.readInput()

        self.ssc.start()
        self.ssc.awaitTermination()

    def inputSQLQuery(self, query):
        self.modQuery = ''
        self.dictInnerQuery = {}

        innerFlag = False
        innerCol = ''
        wordList = query.split(' ')
        wordQuery = ''

        for i in range(len(wordList)):
            word = wordList[i]

            # Detect opening '(' of inner query
            if word == '(SELECT':
                innerFlag = True
                innerCol = wordList[i - 2]

            if innerFlag:
                wordQuery += word + ' '
            else:
                self.modQuery += word + ' '

            # Detect closing ')' of table) and not AVG(col)
            if ')' in word and '(' not in word:
                replaceInner = 'Q' + str(len(self.dictInnerQuery))
                self.modQuery += replaceInner + ' '
                key = replaceInner
                value = [wordQuery, innerCol, 0]
                self.dictInnerQuery[key] = value

                innerFlag = False
                wordQuery = ''

    def readInput(self):
        lines = self.ssc.textFileStream('Data/Live')

        self.csvSchema = StructType([
            StructField('col1', IntegerType()),
            StructField('col2', IntegerType()),
            StructField('col3', IntegerType())
        ])

        # self.stateDF = self.spark.createDataFrame(self.sc.emptyRDD(), self.csvSchema)
        # self.stateDF.show()
        self.globalDF = self.spark.createDataFrame(self.sc.emptyRDD(),
                                                   self.csvSchema)

        self.totalTime = 0.0

        def row(inpStr):
            return Row(int(inpStr[0]), int(inpStr[1]), int(inpStr[2]))

        def iterateRDD(rdd):
            start = time.perf_counter()
            data = rdd.map(lambda line: line.split(' ')).map(row)
            df = data.toDF(self.csvSchema)

            if df.count():
                # curDF = df.union(self.stateDF)
                # self.queryRDD(curDF)

                # Append to global DF for batch outputs
                self.globalDF = df.union(self.globalDF)

                self.outputQuery(self.globalDF)
                self.totalTime += time.perf_counter() - start
                # print(str(round(self.totalTime, 2)) + 's')

        lines.foreachRDD(iterateRDD)

    def queryRDD(self, df):
        df.createOrReplaceTempView('table')

        for key, value in self.dictInnerQuery.items():
            innerQuery = value[0]
            sqlDF = self.spark.sql(innerQuery)
            sqlRes = sqlDF.first()[0]
            self.dictInnerQuery[key][2] = sqlRes

        b = 5
        addToState = [False for i in range(df.count())]
        for key, value in self.dictInnerQuery.items():
            col = value[1]
            val = value[2]
            tupleList = [{col: x[col]} for x in df.rdd.collect()]
            for i in range(len(tupleList)):
                row = tupleList[i]
                if row[col] > val - b and row[col] < val + b:
                    addToState[i] = True

        # print(addToState)
        itr = 0
        newRows = []
        newStateDF = self.spark.createDataFrame(self.sc.emptyRDD(),
                                                self.csvSchema)
        for row in df.rdd.collect():
            if addToState[itr]:
                newRows.append(row)
            itr += 1
        # print(newRows)
        newStateDF = self.spark.createDataFrame(newRows, self.csvSchema)
        self.stateDF = newStateDF
        # self.stateDF.show()

    def outputQuery(self, df):
        # curQuery = ' '.join(list(map((lambda word: str(round(self.dictInnerQuery[word][2], 2)) if word in self.dictInnerQuery else word), self.modQuery.split())))
        # df.createOrReplaceTempView('table')
        # streamOut = self.spark.sql(curQuery).first()[0]
        # print(type(streamOut))

        # self.globalDF.show()
        query = 'SELECT AVG(col2) FROM table WHERE col2 > (SELECT AVG(col2) FROM table)'
        self.globalDF.createOrReplaceTempView('table')
        globalOut = self.spark.sql(query).first()[0]
        # print(type(globalOut))
        print(globalOut)
        # hourly distribution of time gap with resolution 5 seconds
        # threshold: time interval larger than threshold will be discarded
        # time interval span two consecutive hours is taken into consideration
        'hourly_time_gap_distribution': {'index': 1,
                                         'return_std_range': 3,
                                         'resolution': 5,
                                         'threshold': 3600}
    }
    '''
    process_properties = {}

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------Spark Context-----------------------------------------------------------------------------
    sc = SparkContext(appName='Data_Analysis')
    sc.setLogLevel('DEBUG')

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------Prepare Raw RDD---------------------------------------------------------------------------
    raw_rdd = load_raw_data(sc, input_path, process_properties['delimiter'])

    '''
    If you want to add more columns or do some other manipulation to the raw dataset loaded from hdfs, put that code
    here.
    1. Make sure that the transformed rdd is re-assigned to the variable raw_rdd.
    2. Make sure the data dictionary and the column number in the configuration section match the new dataset.
    '''
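
    # A hedged illustration of such a manipulation (hypothetical extra column appended,
    # with the result re-assigned to raw_rdd as the note above requires):
    # raw_rdd = raw_rdd.map(lambda record: record + ['extra_column_value'])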

    # ------------------------------------------------------------------------------------------------------------------
    # ------------------------Main Process------------------------------------------------------------------------------
    main_process(raw_rdd, sc, output_path, data_dict, process_properties)
Example 27
from pyspark import SparkContext, SparkConf, StorageLevel
from pyspark.sql import Row, SQLContext
import re
from operator import add

if __name__ == '__main__':
    conf = SparkConf().setAppName("logReader").setMaster("local[1]")
    conf.set("mytest.sql.crossJoin.enabled", True)
    conf.set("mytest.sql.shuffle.partitions", 5)
    conf.set("mytest.defalut.parallelism", 2)

    sc = SparkContext(conf=conf)
    sc.setLogLevel("INFO")

    input = ["A", "C"]

    str = "A:B,C,D,E,F;B:A,C,D,E;C:A,B,E;D:A,B,E;"
    #
    # rdd = sc.parallelize(re.split(";", str)[0:-1]).map(
    #     lambda kv: (re.split(":", kv)[0], re.split(":", kv)[1])). \
    #     filter(lambda kv: kv[0] in input).map(lambda kv: kv[1])

    keys = sc.parallelize(re.split(";", str)[0:-1]).map(
        lambda kv: (re.split(":", kv)[0], re.split(":", kv)[1].split(","))). \
        filter(lambda kv: kv[0] in input).flatMapValues(lambda x: x).map(lambda x: (x[1], 1)).reduceByKey(add).filter(
        lambda x: x[1] > 1).keys().collect()

    print(",".join(keys))

    # rdd1=rdd.map(lambda v: re.split(",", v)).collect()
    # print(rdd1)
Example 28

if __name__ == '__main__':

    sc_conf = SparkConf()
    sc_conf.setAppName('ps_consumer')  # pyspark consumer
    sc_conf.setMaster('local[*]')
    # sc_conf.set('spark.executor.memory', '2g')
    # sc_conf.set('spark.executor.cores', '4')
    # sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    sc_conf.set('spark.io.compression.codec', 'snappy')

    # sc = SparkContext(master='local[*]', appName='ps_consumer')
    sc = SparkContext(conf=sc_conf)
    sc.setLogLevel('INFO')
    # print(sc)

    ssc = StreamingContext(sc, 5)
    # print(ssc)

    topic = 'firewall'
    partition = 0
    kafka_param = {
        "metadata.broker.list": 'localhost:9092',
        "auto.offset.reset": "smallest",
        "group.id": 'mygroup',
    }
    # topicPartion = TopicAndPartition(topic, partition)
    # fromOffsets = {topicPartion: 500}
    # stream = KafkaUtils.createDirectStream(
Example 29
def init_spark():
    sc = SparkContext(appName="videoStreamCollector")
    ssc = StreamingContext(sc, config.Config.BATCH_DURATION)
    sc.setLogLevel("WARN")
    return sc, ssc
Example 30
        d = json.loads(d)
        city = d["city"].encode("utf-8")
        list_of_cities.append(city)

    gt = len(set(list_of_cities))
    median = get_estimate(list_of_cities)
    # print(str(time),gt,median)
    with open(output_file, "a") as fp:
        writer = csv.writer(fp)
        writer.writerow([time, gt, median])
    return


port_no = int(sys.argv[1])
output_file = sys.argv[2]
num_hash_functions = 45

sc = SparkContext(appName="task2")
sc.setLogLevel("OFF")
ssc = StreamingContext(sc, 5)

with open(output_file, "w") as fp:
    writer = csv.writer(fp)
    writer.writerow(["Time", "Ground Truth", "Estimation"])

hash_list = generate_hash_functions(num_hash_functions)
lines = ssc.socketTextStream("localhost",
                             port_no).window(30, 10).foreachRDD(sample)

ssc.start()  # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
Example 31
def functionToCreateContext():
    sc = SparkContext('local[2]', 'checkpoint')
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint('/tmp/checkpointDirectory')  # set checkpoint directory
    return ssc
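
# A brief usage sketch (not part of the original snippet): with the checkpoint directory
# set above, the context is typically obtained via getOrCreate so a restarted job can
# recover its state from the checkpoint.
# ssc = StreamingContext.getOrCreate('/tmp/checkpointDirectory', functionToCreateContext)
# ssc.start()
# ssc.awaitTermination()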
Example 32
#coding=UTF-8
from pyspark import SparkContext, SparkConf, SQLContext, Row, HiveContext
from pyspark.sql.types import *
from datetime import date, datetime, timedelta
import sys, re, os
import sys, re, os, calendar

st = datetime.now()
conf = SparkConf().setAppName('PROC_A_R_MONTH_CONT').setMaster(sys.argv[2])
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')
if len(sys.argv) > 5 and sys.argv[5] == "hive":
    sqlContext = HiveContext(sc)
else:
    sqlContext = SQLContext(sc)
hdfs = sys.argv[3]
dbname = sys.argv[4]

# dates needed for processing
etl_date = sys.argv[1]
# ETL date
V_DT = etl_date
# previous day
V_DT_LD = (date(int(etl_date[0:4]), int(etl_date[4:6]), int(etl_date[6:8])) +
           timedelta(-1)).strftime("%Y%m%d")
# first day of the month
V_DT_FMD = date(int(etl_date[0:4]), int(etl_date[4:6]), 1).strftime("%Y%m%d")
# last day of the current month (10 characters)
monthRange = calendar.monthrange(int(etl_date[0:4]),
                                 int(etl_date[4:6]))  # number of days in the current month
Example 33
def process(fileName, output):
    lines = open(file=fileName, mode="r").readlines()
    data_list = []
    for line in lines:
        if line is None or len(line.strip()) == 0:
            continue
        for data in json.loads(line):
            title = data['title'].replace('None', '')
            text = data['text'].replace('None', '')
            tmp = ""
            if title is not None or title.strip() != "None":
                tmp = tmp + title.strip() + '.'
            if text is None or text.strip() == "None":
                continue
            tmp = tmp + text.strip()
            tmp.replace("\\u00", '').replace("\n", "")
            data_list.append(tmp)
            # print("$$$$$$$News is:",tmp)
    conf = SparkConf().setAppName("Train_News").setMaster("local[1]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")
    dataRdd = sc.parallelize(data_list).distinct(numPartitions=1)
    tokenizedRdd = dataRdd.map(lambda x: [x, preprocess(x)])
    taggedRdd = tokenizedRdd.map(
        lambda x: [x[0], x[1],
                   getNamedEntities(x[0]),
                   extractSignature(x[0])])
    prepared_data = taggedRdd.collect()

    result = []
    i = 0
    for data1 in prepared_data:
        print('Processed data:', i)
        i = i + 1
        for data2 in prepared_data:
            if data1 == data2:
                continue
            nn_count = 0
            spot_count = 0
            nn_score = 0
            lav_d_count = 0
            lav_d_score = 0
            spot_score = 0

            if len(data1[2]) != 0:
                for nn in data1[2]:
                    n = len(nn)
                    if nn in data2[2]:
                        nn_count = nn_count + 1
                    for tmp in data2[2]:
                        l = len(tmp)
                        if lav_distance(nn, tmp) / (n + l) < 0.25:
                            lav_d_count = lav_d_count + 1
                nn_score = nn_count / len(data1[2])
                lav_d_score = lav_d_count / len(data1[2])
            if len(data1[3]) != 0:
                for nn in data1[3]:
                    if nn in data2[3]:
                        spot_count = spot_count + 1
                spot_score = spot_count / len(data1[3])
            result.append([
                data1[0], data1[2], data1[3], data2[0], data2[2], data2[3],
                nn_count, nn_score, lav_d_count, lav_d_score, spot_count,
                spot_score
            ])
    df = pd.DataFrame(
        data=result,
        columns=[
            "News1", "Names_Entities_1", "Spot_words_1", "News2",
            "Names_Entities_2", "Spot_words_2", "Named_Entity_match_count",
            "Named_Entity_match_score", "Laven_Named_Entity_match_score",
            "Laven_Named_Entity_match_count", "Spot_Words_Match_score",
            "Spot_Words_Match_Score"
        ])
    time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S').replace(
        " ", "_").replace(":", "_")
    df.query('Named_Entity_match_score > 0 or Spot_Words_Match_Score > 0.25'
             ).to_csv(output + time + ".csv")
    print('Report File Saved')
Example 34
from pyspark import SparkContext
"""
nasa_19950701.tsv contains 10000 log lines from one of NASA's apache server for
1st July 1995.
nasa_19950801.tsv contains 10000 log lines for 1st Aug 1995.
Create a spark program to generate a new RDD which contains the log lines
from both July 1st & Aug 1st, take 0.1 sample of those log lines and save it
to "out/sample_nasa_logs.tsv"

The file has header lines:
host logname time method url response bytes

Make sure the head lines are removed in the resulting RDD
"""

sc = SparkContext("local[2]", "NASA log Problem")
sc.setLogLevel("ERROR")

nasa_july = sc.textFile("./in/nasa_19950701.tsv")
nasa_aug = sc.textFile("./in/nasa_19950801.tsv")

nasa_both = nasa_july.union(nasa_aug)

nasa_both_without_header = nasa_both.filter(
    lambda line: not (line.startswith("host") and "bytes" in line))

nasa_both_sample = nasa_both_without_header.sample(withReplacement=True,
                                                   fraction=0.1)

nasa_both_sample.saveAsTextFile("out/sample_nasa_logs.tsv")
Example 35
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext(appName="ML Example")
sc.setLogLevel("FATAL")
sqlContext = SQLContext(sc)

# Prepare training data from a list of (label, features) tuples.
training = sqlContext.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())
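
# A short follow-on sketch (not in the original excerpt): the fitted model is a
# Transformer, so it can append prediction columns to a DataFrame.
prediction = model1.transform(training)
prediction.select("features", "label", "probability", "prediction").show()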
def run():
    # Creating the Spark Context
    sc = SparkContext(master="local[2]", appName="WindowWordCount")
    sc.setLogLevel("ERROR")

    # creating the streaming context
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    # creating the SQL context
    sqlContext = SQLContext(sc)

    host = "localhost"
    port = 5599

    lines = ssc.socketTextStream(host, port)

    hashtags = lines.filter(lambda text: len(text) > 0) \
        .flatMap(lambda text: text.split(" ")) \
        .filter(lambda text: text.lower().startswith('#'))

    Word = namedtuple('Word', ("word", "count"))
    Hashtag = namedtuple('Hashtag', ("tag", "count"))
    Tweet = namedtuple('Tweet', ('text', 'sentiment'))

    stop_words = set(stopwords.words('english'))
    list_punct = list(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    # processing to obtain data about tweets text and sentiment
    lines.window(40) \
        .map(lambda p: clean_tweet(p)) \
        .filter(lambda text: len(text) > 0) \
        .map(lambda p: Tweet(p, analyze_sentiment_polarity(p))) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("tweets"))

    # processing to obtain data about single words in text and their count. NLP tools applied.
    lines.window(40) \
        .map(lambda p: clean_tweet(p)) \
        .filter(lambda text: len(text) > 0) \
        .flatMap(lambda text: text.split(" ")) \
        .map(lambda word: word.lower()) \
        .filter(lambda word: word not in stop_words) \
        .map(lambda word: ''.join(char for char in word if char not in list_punct)) \
        .map(lambda word: lemmatizer.lemmatize(word)) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda p: Word(p[0], p[1])) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("words"))

    # processing to obtain data about hashtags in text and their count.
    hashtags.window(40) \
        .map(lambda word: ''.join(char for char in word if char not in list_punct)) \
        .map(lambda word: (word.lower(), 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda p: Hashtag(p[0], p[1])) \
        .foreachRDD(lambda rdd: rdd.toDF().registerTempTable("hashtags"))

    time_to_wait = 80
    ssc.start()
    print("Session Started.....")
    print("Collecting tweets...waiting for " + str(time_to_wait) +
          " seconds..")
    time.sleep(time_to_wait)  # wait to ensure that some data has been collected.
    print("Tweets Collected....")

    all_hashtags_df = None
    all_tweets_df = None
    all_words_df = None

    count = 1
    count_max = 4
    while count <= count_max:
        print('Count: ' + str(count) + "/" + str(count_max))
        print("Waiting for 30 Seconds.....")
        time.sleep(40)

        words = sqlContext.sql('Select word, count from words')
        words_df = words.toPandas()
        print(words_df)
        if all_words_df is None:
            all_words_df = words_df
        else:
            all_words_df = pd.concat([all_words_df, words_df],
                                     join='inner',
                                     ignore_index=True)

        tags = sqlContext.sql('Select tag, count from hashtags')
        tags_df = tags.toPandas()
        print(tags_df)
        if all_hashtags_df is None:
            all_hashtags_df = tags_df
        else:
            all_hashtags_df = pd.concat([all_hashtags_df, tags_df],
                                        join='inner',
                                        ignore_index=True)

        tweets = sqlContext.sql('Select text, sentiment from tweets')
        tweets_df = tweets.toPandas()
        if all_tweets_df is None:
            all_tweets_df = tweets_df
        else:
            all_tweets_df = pd.concat([all_tweets_df, tweets_df],
                                      join='inner',
                                      ignore_index=True)

        count += 1

    ssc.stop()

    # Saving all dataframes as csv.
    if all_hashtags_df is not None:
        all_hashtags_df.to_csv('hashtags.csv')
    if all_words_df is not None:
        all_words_df.to_csv('words.csv')
    if all_tweets_df is not None:
        all_tweets_df.to_csv('tweets.csv')
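# clean_tweet() and analyze_sentiment_polarity() are helpers defined elsewhere in the
# original project; a minimal hypothetical sketch, assuming TextBlob is available:
import re
from textblob import TextBlob

def clean_tweet(text):
    # strip URLs, @mentions and redundant whitespace (a simplified, assumed cleaning step)
    text = re.sub(r'http\S+|@\w+', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def analyze_sentiment_polarity(text):
    # polarity score in [-1, 1] as computed by TextBlob
    return TextBlob(text).sentiment.polarity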
Example n. 37
def main():
    conf = (SparkConf()
                .setMaster("local[*]")
                .setAppName("compare_engine"))
                
    sc = SparkContext(conf = conf)
    sc.setLogLevel('INFO')

    sc.addFile(primary)

    # rdd_primary = sc.textFile(primary, minPartitions=4, use_unicode=True).distinct() 
    rdd_primary = sc.textFile(SparkFiles.get(primary), minPartitions=4, use_unicode=True).distinct() 
    rdd_primary = rdd_primary.repartition(10).cache()

    os.system('rm -Rf collects_*')
    os.system('rm -Rf holder.txt')
       
    rdd_secondary = sc.textFile(secondary, minPartitions=4, use_unicode=True).distinct()
    rdd_secondary = rdd_secondary.repartition(10).cache()

    primary_count = rdd_primary.count()
    primary_report['count'] = primary_count
    print(primary_report)

    secondary_count = rdd_secondary.count()
    secondary_report['count'] = secondary_count
    print(secondary_report)

    # Return each Primary file line/record not contained in Secondary
    not_in_primary  = rdd_primary.subtract(rdd_secondary)
    primary_diff = not_in_primary.count()
    primary_report['diff'] = primary_diff
    
    os.system('rm -Rf collects_*.csv')

    primary_dir = 'collects_{}_primary'.format(run_date)
    primary_report_name = 'collects_{}_primary_report.csv'.format(run_date)

    not_in_primary.coalesce(1, True).saveAsTextFile(primary_dir)

    # os.system('cat collects_{}_primary/part-0000* >> collects_{}_primary_report.csv'.format(run_date, run_date))
    os.system('cat {}/part-0000* >> {}'.format(primary_dir, primary_report_name))
    os.system('wc -l collects_{}_primary_report.csv'.format(run_date))

    # Flip Primary Vs Secondary
    # Return each Secondary file line/record not contained in Primary
    not_in_secondary  = rdd_secondary.subtract(rdd_primary)
    secondary_diff = not_in_secondary.count()
    secondary_report['diff'] = secondary_diff

    not_in_secondary.coalesce(1,True).saveAsTextFile('collects_{}_secondary'.format(run_date))
    os.system('cat collects_{}_secondary/part-0000* >> collects_{}_secondary_report.csv'.format(run_date, run_date))
    os.system('wc -l collects_{}_secondary_report.csv'.format(run_date))

    process_report['primary'] = primary_report
    process_report['secondary'] =  secondary_report

    print("=" * 100)
    print('\n')
    print(process_report)
    print('\n')
    print("=" * 100)
    spark_details(sc)
    

    sc.stop()
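# spark_details() is not shown above; a hypothetical sketch of what such a helper might print:
def spark_details(sc):
    print('Spark version : {}'.format(sc.version))
    print('Master        : {}'.format(sc.master))
    print('App name      : {}'.format(sc.appName))
    print('Parallelism   : {}'.format(sc.defaultParallelism))
    for key, value in sc.getConf().getAll():
        print('{} = {}'.format(key, value))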
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
import re


if __name__ == '__main__':
    sc = SparkContext(appName='PythonSparkStreamingKafka')
    sc.setLogLevel(logLevel='WARN')

    ssc = StreamingContext(sparkContext=sc, batchDuration=1)
    kafkaStream = KafkaUtils.createDirectStream(ssc=ssc, topics=['trump'],
                                                kafkaParams={"metadata.broker.list": 'localhost:9092'})

    regex = re.compile('\\w+')

    lines = kafkaStream.map(lambda line: json.loads(line[1])).\
        filter(lambda d: d.get('lang', '') == 'en').\
        flatMap(lambda d: regex.findall(d['text'].lower())).\
        map(lambda word: (word, 1)).\
        reduceByKey(lambda x, y: x+y)
    lines.pprint()
    lines.saveAsTextFiles('hdfs:///home/hduser/test_1')
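    # A sliding-window variant (a sketch, not part of the original): 60s window, 10s slide.
    # The inverse-reduce form requires checkpointing; the checkpoint directory name and the
    # durations are illustrative.
    ssc.checkpoint('checkpoint_kafka')
    windowed = kafkaStream.map(lambda line: json.loads(line[1])) \
        .filter(lambda d: d.get('lang', '') == 'en') \
        .flatMap(lambda d: regex.findall(d['text'].lower())) \
        .map(lambda word: (word, 1)) \
        .reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 60, 10)
    windowed.pprint()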
    ssc.start()
    ssc.awaitTermination()
STREAM_OUT = 'stream-OUT'

# We first delete all files from the STREAM_IN folder
# before starting spark streaming.
# This way, all files are new
print("Deleting existing files in %s ..." % STREAM_IN)
p = Path('.') / STREAM_IN
for f in p.glob("*.ordtmp"):
    os.remove(f)
print("... done")

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

sc = SparkContext("local[*]", "CountAndVolumePerBatch")
sc.setLogLevel(
    "WARN")  #Make sure warnings and errors observed by spark are printed.

ssc = StreamingContext(sc, 5)  #generate a mini-batch every 5 seconds
filestream = ssc.textFileStream(
    STREAM_IN)  #monitor new files in folder stream-IN


def parseOrder(line):
    '''parses a single line in the orders file'''
    s = line.split(",")
    try:
        if s[6] != "B" and s[6] != "S":
            raise Exception('Wrong format')
        return [{
            "time": datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S"),
            "orderId": int(s[1]),
Example n. 40
    parser = argparse.ArgumentParser(description='Prepare data')
    parser.add_argument('config_file')
    args = parser.parse_args()

    # Load config file
    with open(args.config_file, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
        resolve_placeholder(cfg)

    cfg_log = cfg['log']
    cfg_rti = cfg['pipeline']['rti_transform']

    sc = SparkContext()
    hive_context = HiveContext(sc)
    sc.setLogLevel(cfg_log['level'])

    default_hour = cfg_rti['default_hour']
    default_price_cat = cfg_rti['default_price_cat']
    day_step = cfg_rti['day_step']
    start_day = cfg_rti['start_day']
    end_day = cfg_rti['end_day']
    new_bucket_size = cfg_rti['new_bucket_size']
    input_table = cfg_rti['input_table']
    output_table = cfg['factdata_table_name']

    run(hive_context=hive_context,
        input_table=input_table,
        output_table=output_table,
        start_day=start_day,
        end_day=end_day,
import traceback


# In[2]:

try:

    try:
        timespan=str(sys.argv[1])
    except IndexError:
        print 'please pass timespan as an argument'
        sys.exit()

    conf = (SparkConf().setMaster("local").setAppName("hi_report_app").set("spark.executor.memory", "1g"))
    sc = SparkContext(conf = conf)
    sc.setLogLevel("Error")
    sqlContext = SQLContext(sc)
    # In[2]:

    config_url='https://s3-ap-southeast-1.amazonaws.com/nlplive.humanindex.data/config.json'
    try:
        config_response=requests.get(config_url)
        config = json.loads(config_response.content)
    except:
        print "Cannot fetch Config......"

    # In[23]:

    try:
        fetch_response=requests.get(str(config['baseAPIUrl'])+'/'+str(config['version'])+'/preProcessing/GetPredictionFileJob/'+timespan+'/publisher')
        #check if api request is successfull or not
Example n. 42
def error(point, kmeans):
    """ Convert Apache time format into a Python datetime object
    Args:
        point: point to predict in model
        kmeans (KMeansModel object): trained k-means model
    Returns:
        float: Calculate the within cluster squared error distance and return total for model
    """
    center = kmeans.centers[kmeans.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


if __name__ == '__main__':
    # Initialize SparkContext object
    sc = SparkContext(appName="PythonDetectDDOS")
    sc.setLogLevel("ERROR")  # Reduce logging
    sqlContext = sql.SQLContext(sc)

    # Path to log input file
    logFile = "/user/root/src/Project - Developer - apache-access-log (4).txt.gz"

    # Read log text file and parse based on Apache log standard
    parsed_logs, access_logs = parseLogs(sc, logFile)

    # Process data for feature columns to be used in training
    df4 = dataProcessing(access_logs)
    df4.show()

    # Format DataFrame into Dense Vector for mllib K-means clustering
    data7 = df4.rdd.map(lambda row: Vectors.dense(row[2], row[3]))
    data7.cache()
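    # A sketch of how the error() helper above could be used (k and maxIterations are
    # illustrative; KMeans comes from pyspark.mllib.clustering):
    from pyspark.mllib.clustering import KMeans
    model = KMeans.train(data7, k=2, maxIterations=20, initializationMode="k-means||")
    total_distance = data7.map(lambda point: error(point, model)).reduce(lambda a, b: a + b)
    print("Total within-cluster distance = " + str(total_distance))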
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: script.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    zkQuorum, topic = sys.argv[1:]

    sc = SparkContext(appName="KafkaSparkStreaming")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    ks = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 42})

    def processInput(line):
        fields = line[1].split("\t")
        return ((str(fields[6]), 1), (str(fields[7]), 1))

    def updateFunction(newValues, runningCount):
        return sum(newValues, runningCount or 0)

    digest = ks.flatMap(processInput)\
               .updateStateByKey(updateFunction)\
               .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False)\
Example n. 44
def main():
    "Main function"
    optmgr  = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parse additional data (enrich the given data with: group name, node kind, acquisition era, data tier, current date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"	
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

	# print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None,None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)			
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())		
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())	
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)
		
        #5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus)
		
    else:	
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
def main(): 
	root =  os.path.dirname(os.path.abspath(__file__))

	print("Digits Handwriting Recognition using Spark")
	print("Root file path is = %s" %root)
	conf = SparkConf().setAppName("OCR")
	sc = SparkContext(conf = conf)
	sc.setLogLevel("WARN")

	sqlContext = SQLContext(sc)


	print("loading dataset")
	trainRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist")
	testRDD = MLUtils.loadLibSVMFile(sc, root + "/dataset/svm/mnist.t")

	# check if the RDD supports toDF
	if not hasattr(trainRDD, "toDF"):
		print("ERROR: RDD does not support toDF")
		raise SystemExit(1)


	## convert RDDs to data frames
	trainDF = trainRDD.toDF()
	testDF = testRDD.toDF()

	print("INFO: train dataframe count = %u" %trainDF.count())
	print("INFO: test dataframe count = %u" %testDF.count())

	indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
	dtc = DecisionTreeClassifier(labelCol="indexedLabel")

	pipeline = Pipeline(stages=[indexer, dtc])
	model = pipeline.fit(trainDF)


	## train multiple depth models
	variedMaxDepthModels = []
	
	print("Create varied depth CNN models [1..8]")
	for mdepth in xrange(1, 9):
		start = time.time()			

		## maximum depth
		dtc.setMaxDepth(mdepth)
		
		## create pipeline
		pipeline = Pipeline(stages = [indexer, dtc])
		
		## create the model
		model = pipeline.fit(trainDF)
		
		## add to varied container
		variedMaxDepthModels.append(model)

		end = time.time()

		print("trained a CNN depth of %u, duration = [%.3f] secs" %(mdepth, end - start))
	
	print("=================================================")

	## report model accuracies
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", metricName="precision")
	
	## mdepth
	print("Evaluate all models precision")
	for mdepth in xrange(1, 9):
		model = variedMaxDepthModels[mdepth - 1]
		
		predictions  = model.transform(testDF)
		
		precision = evaluator.evaluate(predictions)
		
		print("CNN depth = %u, precision = %.3f" %(mdepth, precision))

				
		
	print("Finished processing %u digits" %testDF.count())
Example n. 46
from pyspark import SparkContext

sc = SparkContext(appName="streamingkafka")
sc.setLogLevel("WARN")  # 减少shell打印日志
rdd = sc.textFile('daily_IBM.csv')
rdd = rdd.flatMap(lambda x: x.split(','))
print(rdd.collect())
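# A sketch of row-wise parsing instead of flattening (assumption: daily_IBM.csv has a header
# row such as "timestamp,open,high,low,close,volume", with the close price in column 5):
rows = sc.textFile('daily_IBM.csv').map(lambda line: line.split(','))
header = rows.first()
closes = rows.filter(lambda r: r != header).map(lambda r: float(r[4]))
print('average close = %.2f' % closes.mean())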


Example n. 47
from sklearn.metrics.cluster import normalized_mutual_info_score
from pyspark import SparkContext
import json

sc = SparkContext(appName="INF553HW5", master="local[*]")
sc.setLogLevel("WARN")
sc.setLogLevel("ERROR")

clustering_file_path = "o1"
label_path = "../resource/asnlib/publicdata/cluster2.json"

ground_truth = sc.textFile(label_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(index, label) for index, label in line.items()]). \
    map(lambda pair: (int(pair[0]), pair[1])). \
    collect()
ground_truth.sort()
ground_truth = [cid for _, cid in ground_truth]

ground_truth_cluster_size = sc.textFile(label_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(index, label) for index, label in line.items()]). \
    map(lambda pair: (pair[1], pair[0])). \
    groupByKey(). \
    mapValues(len).collect()
ground_truth_cluster_size.sort(key=lambda pair: pair[1])

prediction_cluster_size = sc.textFile(clustering_file_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(index, label) for index, label in line.items()]). \
    map(lambda pair: (pair[1], pair[0])). \
    groupByKey(). \
    mapValues(len).collect()
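# A sketch of the NMI evaluation suggested by the sklearn import above (assumption: the
# prediction file uses the same {index: cluster_id} JSON layout as the ground-truth file):
predictions = sc.textFile(clustering_file_path).map(lambda line: json.loads(line)). \
    flatMap(lambda line: [(int(index), label) for index, label in line.items()]). \
    collect()
predictions.sort()
predictions = [cid for _, cid in predictions]
print("NMI:", normalized_mutual_info_score(ground_truth, predictions))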
Example n. 48
def aggregate(hdir, cond, precision, min_date, max_date):
    "Collect aggregated statistics from HDFS"

    start_time = time.time()
    print("Aggregating {} FWJR performance data in {} matching {} from {} to {}...".format(precision.replace('y', 'i') + 'ly', hdir, cond, min_date, max_date))

    conf = SparkConf().setAppName("wmarchive fwjr aggregator")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    # To test the procedure in an interactive pyspark shell:
    #
    # 1. Open a pyspark shell with appropriate configuration with:
    #
    # ```
    # pyspark --packages com.databricks:spark-avro_2.10:2.0.1 --driver-class-path=/usr/lib/hive/lib/* --driver-java-options=-Dspark.executor.extraClassPath=/usr/lib/hive/lib/*
    # ```

    # 2. Paste this:
    #
    # >>>
    # from pyspark.sql.functions import *
    # from pyspark.sql.types import *
    # hdir = '/cms/wmarchive/avro/2016/06/28*'
    # precision = 'day'
    fwjr_df = sqlContext.read.format("com.databricks.spark.avro").load(hdir)
    # <<<

    # Here we process the filters given by `cond`.
    # TODO: Filter by min_date and max_date and possibly just remove the `hdir` option and instead process the entire dataset, or make it optional.
    fwjr_df = make_filters(fwjr_df, cond)

    # 3. Paste this:
    #
    # >>>

    # Select the data we are interested in
    jobs = fwjr_df.select(
        fwjr_df['meta_data.ts'].alias('timestamp'),
        fwjr_df['meta_data.jobstate'],
        fwjr_df['meta_data.host'],
        fwjr_df['meta_data.jobtype'],
        fwjr_df['task'],
        fwjr_df['steps.site'].getItem(0).alias('site'), # TODO: improve
        fwjr_df['steps'], # TODO: `explode` here, see below
        # TODO: also select `meta_data.fwjr_id`
    )

    # Transform each record into the data we then want to group by:

    # Transform timestamp to start_date and end_date with given precision,
    # thus producing many jobs that have the same start_date and end_date.
    # These will later be grouped by.
    timestamp = jobs['timestamp']
    if precision == "hour":
        start_date = floor(timestamp / 3600) * 3600
        end_date = start_date + 3600
    elif precision == "day":
        start_date = floor(timestamp / 86400) * 86400
        end_date = start_date + 86400
    elif precision == "week":
        end_date = next_day(to_date(from_unixtime(timestamp)), 'Mon')
        start_date = date_sub(end_date, 7)
        start_date = to_utc_timestamp(start_date, 'UTC')
        end_date = to_utc_timestamp(end_date, 'UTC')
    elif precision == "month":
        start_date = trunc(to_date(from_unixtime(timestamp)), 'month')
        end_date = date_add(last_day(start_date), 1)
        start_date = to_utc_timestamp(start_date, 'UTC')
        end_date = to_utc_timestamp(end_date, 'UTC')

    jobs = jobs.withColumn('start_date', start_date)
    jobs = jobs.withColumn('end_date', end_date)
    jobs = jobs.withColumn('timeframe_precision', lit(precision))
    jobs = jobs.drop('timestamp')

    # Transform `task` to task and workflow name
    jobs = jobs.withColumn('taskname_components', split(jobs['task'], '/'))
    jobs = jobs.withColumn('workflow', jobs['taskname_components'].getItem(1))
    jobs = jobs.withColumn('task', jobs['taskname_components'].getItem(size(jobs['taskname_components'])))
    jobs = jobs.drop('taskname_components')

    # Extract exit code and acquisition era
    stepScopeStruct = StructType([
        StructField('exitCode', StringType(), True),
        StructField('exitStep', StringType(), True),
        StructField('acquisitionEra', StringType(), True),
    ])
    def extract_step_scope(step_names, step_errors, step_outputs):
        # TODO: improve this rather crude implementation
        exitCode = None
        exitStep = None
        for (i, errors) in enumerate(step_errors):
            if len(errors) > 0:
                exitCode = errors[0].exitCode
                exitStep = step_names[i]
                break
        acquisitionEra = None
        for outputs in step_outputs:
            if len(outputs) > 0:
                acquisitionEra = outputs[0].acquisitionEra
                break
        return (exitCode, exitStep, acquisitionEra)

    extract_step_scope_udf = udf(extract_step_scope, stepScopeStruct)
    jobs = jobs.withColumn('step_scope', extract_step_scope_udf('steps.name', 'steps.errors', 'steps.output'))
    jobs = jobs.select('*', 'step_scope.exitCode', 'step_scope.exitStep', 'step_scope.acquisitionEra').drop('step_scope')

    # <<<

    # You can check the schema at any time with:
    # ```
    # jobs.printSchema()
    # ```

    # TODO: Phase 1: Aggregation over steps
    #
    #       Each job has a list of `steps`, each with a `performance` dictionary.
    #       These performance dictionaries must be combined to one by summing their
    #       values, or possibly in a different way for each metric.
    #       E.g. if a job has 3 steps, where 2 of them have a `performance`
    #       dictionary with values such as `performance.cpu.TotalJobTime: 1` and
    #       `performance.cpu.TotalJobTime: 2`, then as a result the _job_ should
    #       have a `performance` dictionary with `performance.cpu.TotalJobTime: 3`.
    #
    #       All keys in the `performance` schema should be aggregated over in this fashion.
    #       The performance metrics are documented in https://github.com/knly/WMArchiveAggregation
    #       with a reference to `WMArchive/src/maps/metrics.json`.
    #
    #       To achieve this aggregation using pyspark-native functions, we should
    #       `explode` on the `steps` array and possibly even further down into
    #       `output` and/or `errors`, keeping track of the `meta_data.fwjr_id`.
    #       Then we can group by the `fwjr_id` and make use of the pyspark aggregation
    #       functions such as `pyspark.sql.functions.sum` similar to below.

    # Phase 2: Aggregation over jobs

    # Group jobs by scope
    # TODO: Explore if this is a performance bottleneck since everything
    #       is processed on one node. An approach based on a `reduce` function
    #       may be more feasible. That said, the `groupBy` is exactly
    #       the functionality we want to achieve and is pyspark-native,
    #       so I believe we should test this first and see if it really
    #       leads to any problems.
    scopes = jobs.groupBy([
        'start_date',
        'end_date',
        'timeframe_precision',
        'jobstate',
        'host',
        'jobtype',
        'site',
        'workflow',
        'task',
        'acquisitionEra',
        'exitCode',
        'exitStep',
    ])

    # Perform the aggregation over the grouped jobs
    stats = scopes.agg(*(
        [
            count('jobstate').alias('count')
        ] + [
            # TODO: Specify all aggregation keys here by reading the `performance` schema
            #       to take the average over all jobs.
            # avg(aggregation_key) for aggregation_key in aggregation_keys
        ]
    )).collect()

    # TODO: Reshape, so that the grouped-by keys are shifted into a `scope` dictionary
    #       and the aggregated performance metrics are shifted into a `performance`
    #       dictionary, to finally achieve the data structure detailed in
    #       https://github.com/knly/WMArchiveAggregation
    stats = [row.asDict() for row in stats]

    print("Aggregation finished in {} seconds.".format(time.time() - start_time))
#     print("Result of aggregation: {}".format(stats))

    return stats
# def line reconstructed (inferred from the filter_retweeted() call further below)
def filter_retweeted(tweet):
  json_tweet = json.loads(tweet)
  if 'retweet_count' in json_tweet:
    text = json_tweet["text"]
    RT_text = text[:2]
    if json_tweet['retweet_count'] > 0 or RT_text == "RT":
      return True
  return False

def print_result(**kwargs):
  print("---------------------", kwargs["source_type"], "--------------")
  print("All", kwargs["all"].value)
  print("Retweeted", kwargs["retweeted"].value)
  
conf = SparkConf().setAppName("Twitter tweets listener").setMaster('local[2]')
sparkContext = SparkContext(conf=conf)
sparkContext.setLogLevel("ERROR")

streamingContext = StreamingContext(sparkContext, 1)

android_count = sparkContext.accumulator(0)
iphone_count = sparkContext.accumulator(0)

android_retweeted_count = sparkContext.accumulator(0)
iphone_retweeted_count = sparkContext.accumulator(0)

dstream = streamingContext.socketTextStream(HOST, PORT)

sampled = dstream.transform(samping_function)
json_objects = sampled.filter(lambda input: filter_tweets_source(input))
filtered = json_objects.filter(lambda input: filter_retweeted(input))
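# Hypothetical continuation (the original snippet is cut off here): bump the per-source
# accumulators on each mini-batch; the 'source' field parsing is an assumption, and json
# is assumed to be imported above.
def count_retweets(rdd):
    for tweet in rdd.collect():
        source = json.loads(tweet).get('source', '').lower()
        if 'android' in source:
            android_retweeted_count.add(1)
        elif 'iphone' in source:
            iphone_retweeted_count.add(1)

filtered.foreachRDD(count_retweets)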
Example n. 50
s_logger = logging.getLogger('py4j.java_gateway')
s_logger.setLevel(logging.ERROR)

#pip install graphframes
# os.environ["PYSPARK_SUBMIT_ARGS"] = ("--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11")

scConf = pyspark.SparkConf() \
    .setAppName('hw4') \
    .setMaster('local[3]')
sc = SparkContext(conf=scConf)

#sc = SparkContext('local[*]', 'task1')

# sc = SparkContext.getOrCreate()
# sqlContext = SQLContext(sc)
sc.setLogLevel('ERROR')

N = 7
#N=int(sys.argv[1])

input_file_path = '../../PycharmProjects/553hw4/ub_sample_data.csv'
#input_file_path = sys.argv[2]
textRDD = sc.textFile(input_file_path).persist()

output_file_path = '../../PycharmProjects/553hw4/betweenness.txt'
#output_file_path = sys.argv[3]

output_file_path2 = '../../PycharmProjects/553hw4/community.txt'
#output_file_path2 = sys.argv[4]

user_id_list = textRDD.map(lambda line: line.split(",")).filter(
 def __init__(self, input, output):
     conf = SparkConf().setMaster('local').setAppName('URI-MapReduce')
     sc = SparkContext(conf=conf)
     sc.setLogLevel("WARN")
     self.input_rdd = sc.textFile(input)
     self.output = output
# WARNING: This code was developed on a Python 2.7 and spark-1.5.0
# build and may not run as expected on other configurations.
#
######################

import re
import math
from scipy.stats import poisson
import time
import sys, getopt
import os

# Initialize Spark
from pyspark import SparkContext
sc = SparkContext()
sc.setLogLevel('ERROR')

######################
#
# Submission by Gioia Dominedo (Harvard ID: 40966234) for
# CS 205 - Computing Foundations for Computational Science
# 
# This is part of a joint project with Kendrick Lo that includes a
# separate component for word-level checking. This script includes 
# one of three SPARK implementations for context-level spell-checking
# adapted from third party algorithms (Symspell and Viterbi algorithms). 
#
# The following were also used as references:
# Peter Norvig, How to Write a Spelling Corrector
#   (http://norvig.com/spell-correct.html)
# Peter Norvig, Natural Language Corpus Data: Beautiful Data
    # number of distinct elements
    expectedEstimate = 2**numHashes

    random.seed(SEED)
    a = random.choices([x for x in range(1000, 30000) if isPrime(x)],
                       k=numHashes + 1)
    # print("a: ", a)
    b = random.choices([x for x in range(1000, 30000) if isPrime(x)],
                       k=numHashes + 1)
    # print("b: ", b)

    # Create a local StreamingContext with two working thread and batch interval of 1 second
    # SparkContext.setSystemProperty('spark.executor.memory', '4g')
    # SparkContext.setSystemProperty('spark.driver.memory', '4g')
    sc = SparkContext("local[*]", "countDistinctCity")
    sc.setLogLevel(logLevel="OFF")

    # batch interval
    ssc = StreamingContext(sc, batch_size)

    outputFile = open(output_file_path, "w", encoding="utf-8")
    out = "Time,Ground Truth,Estimation" + "\n"
    outputFile.write(out)

    # Create a Data Stream that connects to localhost:9999
    dataRDD = ssc.socketTextStream("localhost", port_number)

    # extract the city of each incoming business, then estimate distinct cities per window
    resultRDD = dataRDD.map(json.loads).map(lambda x: x['city'])\
        .window(windowDuration=window_length, slideDuration=sliding_interval)\
        .foreachRDD(FMAlgo)
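    # Hypothetical sketch of FMAlgo (the real implementation is not shown; in the actual
    # script it must be defined before the pipeline above references it). Classic
    # Flajolet-Martin: hash each city with every (a, b) pair, track the longest run of
    # trailing zeros, and estimate 2**r; the 2**16 modulus and the plain mean are assumptions.
    def FMAlgo(t, rdd):
        cities = rdd.collect()
        ground_truth = len(set(cities))
        estimates = []
        for ai, bi in zip(a, b):
            longest = 0
            for city in cities:
                h = (ai * hash(city) + bi) % (2 ** 16)
                trailing = len(bin(h)) - len(bin(h).rstrip('0')) if h != 0 else 0
                longest = max(longest, trailing)
            estimates.append(2 ** longest)
        estimate = sum(estimates) // len(estimates)
        outputFile.write("{},{},{}\n".format(t, ground_truth, estimate))
        outputFile.flush()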
Example n. 54
parser.add_argument('-k', dest='kmercount', type=int)
args = parser.parse_args()
k = args.kmercount

def get_samples(filename):
    samples = []
    for sample in open(filename).readlines():
        samples.append(sample.strip()[:-4])
    return samples

samples = get_samples('/root/istc_oceanography/metadata/valid_samples_GA02_filenames.csv')
samples += get_samples('/root/istc_oceanography/metadata/valid_samples_GA03_filenames.csv')

master_url = open("/root/spark-ec2/cluster-url").read().strip()
context = SparkContext(master_url)
context.setLogLevel("WARN")
sqlcontext = SQLContext(context)

def extract_kmers(r):
    for i in range(0,len(r.seq)-k+1):
        yield r.seq[i:i+k]

for sample_name in samples:
    sample_filename = "s3n://helgag/ocean_metagenome/overlapped/{sample_name}.csv".format(sample_name=sample_name)
    customSchema = StructType([ \
                StructField("id", StringType(), True), \
                StructField("seq", StringType(), True)])
    sample = sqlcontext.read.format('com.databricks.spark.csv').options(header='true').load(sample_filename, schema=customSchema).repartition(80)
    sample = sample.flatMap(extract_kmers).map(Row("kmer")).toDF().groupBy("kmer").agg(count("*"))
    #Toggle comment the following to export the data
    sample.registerTempTable(sample_name + "_count")
Example n. 55
import re
from pyspark import SparkContext
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plot
import numpy as np
import zipfile


sparkContext = SparkContext()
sparkContext.setLogLevel('ERROR')



# numbers.txt
startTime = datetime.now()
file = sparkContext.textFile("Numbers.zip/numbers.txt")
numbers = file.flatMap(lambda line: line.split(" "))
print('#####################################################################')
print('#############################  OUTPUT  ##############################')
print('[NUMBERS.TXT] [MEAN] :' +str(numbers.map(lambda num: float(num)).mean()))
print('[NUMBERS.TXT] [STDEV] :'+str(numbers.map(lambda num: float(num)).stdev()))
print('[NUMBERS.TXT] [VARIANCE] :'+str(numbers.map(lambda num: float(num)).variance()))
endTime = datetime.now()
duration = endTime - startTime

# numbers2.txt
startTime2 = datetime.now()
file2 = sparkContext.textFile("numbers2.txt")
numbers2 = file2.flatMap(lambda line: line.split(" "))
print('#####################################################################')
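# Continuation mirroring the numbers.txt block above (an assumed reconstruction; the original
# example is cut off at this point):
print('[NUMBERS2.TXT] [MEAN] :' + str(numbers2.map(lambda num: float(num)).mean()))
print('[NUMBERS2.TXT] [STDEV] :' + str(numbers2.map(lambda num: float(num)).stdev()))
print('[NUMBERS2.TXT] [VARIANCE] :' + str(numbers2.map(lambda num: float(num)).variance()))
endTime2 = datetime.now()
duration2 = endTime2 - startTime2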
Example n. 56
# build and may not run as expected on other configurations.
#
######################

import re
import math
from scipy.stats import poisson
import time
import sys, getopt
import os

# Initialize Spark
from pyspark import SparkContext

sc = SparkContext()
sc.setLogLevel("ERROR")

######################
#
# Submission by Gioia Dominedo (Harvard ID: 40966234) for
# CS 205 - Computing Foundations for Computational Science
#
# This is part of a joint project with Kendrick Lo that includes a
# separate component for word-level checking. This script includes
# one of three SPARK implementations for context-level spell-checking
# adapted from third party algorithms (Symspell and Viterbi algorithms).
#
# The following were also used as references:
# Peter Norvig, How to Write a Spelling Corrector
#   (http://norvig.com/spell-correct.html)
# Peter Norvig, Natural Language Corpus Data: Beautiful Data
Example n. 57
def main():
	parser = argparse.ArgumentParser(description="sparK-mer")
	parser.add_argument("-N",metavar="INT", help="Number of nodes to use [%(default])", default=19, type=int)
	parser.add_argument("-C",metavar="INT", help="Cores per node [%(default)]", default=24, type=int)
	parser.add_argument("-E",metavar="INT", help="Cores per executor [%(default)]", default=4, type=int)
	parser.add_argument("-M",metavar="STR", help="Namenode", default="c252-104", type=str)
	parser.add_argument("-L",metavar="STR", help="Log level", default="WARN", type=str)
	parser.add_argument("-K",metavar="INT", help="k-mer size [%(default)]", default=15, type=int)
	parser.add_argument("-v", action="store_true", help="Verbose output")
	args = parser.parse_args()
	
	executorInstances = args.N * args.C // args.E

	# Explicitly set the storage level
	#StorageLevel(True, True, False, True, 1)
	
	# Set up spark configuration
	conf = SparkConf().setMaster("yarn-client").setAppName("sparK-mer")
	#conf = SparkConf().setMaster("local[16]").setAppName("sparK-mer")
	conf.set("yarn.nodemanager.resource.cpu_vcores",args.C)
	# Saturate with executors
	conf.set("spark.executor.instances",executorInstances)
	conf.set("spark.executor.heartbeatInterval","5s")
	# cores per executor
	conf.set("spark.executor.cores",args.E)
	# set driver cores
	conf.set("spark.driver.cores",12)
	# Number of akka threads
	conf.set("spark.akka.threads",256)
	# Agregation worker memory
	conf.set("spark.python.worker.memory","5g")
	# Maximum message size in MB
	conf.set("spark.akka.frameSize","128")
	conf.set("spark.akka.timeout","200s")
	conf.set("spark.akka.heartbeat.interval","10s")
	#conf.set("spark.broadcast.blockSize","128m")
	conf.set("spark.driver.maxResultSize", "20g")
	conf.set("spark.reducer.maxSizeInFlight","5g")
	conf.set("spark.executor.memory","7g")
	#conf.set("spark.shuffle.memoryFraction",0.4)
	#conf.set("spark.storage.memoryFraction",0.3)
	#conf.set("spark.storage.unrollFraction",0.3)
	#conf.set("spark.storage.memoryMapThreshold","256m")
	#conf.set("spark.kryoserializer.buffer.max","1g")
	#conf.set("spark.kryoserializer.buffer","128m")
	#conf.set("spark.core.connection.ack.wait.timeout","600")
	#conf.set("spark.shuffle.consolidateFiles","true")
	#conf.set("spark.shuffle.file.buffer","32m")
	conf.set("spark.shuffle.manager","sort")
	conf.set("spark.shuffle.spill","true")

	# Set up Spark Context
	sc = SparkContext("","",conf=conf)
	sc.setLogLevel(args.L)

	# Process DB
	#frequencyProfile = generateFP(sc, args.K, "hdfs://c252-104/user/gzynda/random_20", args.v)
	fpStart = time.time()
	frequencyProfile = generateFP(sc, args.K, "/user/gzynda/library", args.v)
	frequencyProfile.cache()
	nGenomes = frequencyProfile.count()
	fpSecs = time.time()-fpStart
	print "############################################"
	print "Counted %i genomes in %.2f seconds"%(nGenomes, fpSecs)
	print "############################################"

	# Parse FQ
	fqStart = time.time()
	fqFrequency = parseFQ(sc, args.K, "/user/gzynda/reads/HiSeq_accuracy.fq", args.v)
	fqFrequency.cache()
	nReads = fqFrequency.count()
	fqSecs = time.time()-fqStart
	print "############################################"
	print "Parsed %i reads in %.2f seconds"%(nReads, fqSecs)
	print "############################################"

	# Classify reads
	classStart = time.time()
	#classify(sc, fqFrequency, frequencyProfile, args.v)
	nReads = setClassify(sc, fqFrequency, frequencyProfile, args.v)
	classSecs = time.time()-classStart
	print "############################################"
	print "Classified %i reads in %.2f seconds"%(nReads, classSecs)
	print "Ran on %i executor instances"%(executorInstances)
	print "K = %i"%(args.K)
	print "############################################"
	sys.exit()
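# Standard entry point (an assumption; not shown in the original snippet):
if __name__ == '__main__':
	main()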