def SearchTiles_and_Factorize(n): 
	global globalmergedtiles
	global globalcoordinates
	global factors_accum 
	global spcon

	spcon = SparkContext("local[4]","Spark_TileSearch_Optimized")

	if persisted_tiles == True:
		tileintervalsf=open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals","r")

		tileintervalslist=tileintervalsf.read().split("\n")
		#print "tileintervalslist=",tileintervalslist
		tileintervalslist_accum=spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
		paralleltileintervals=spcon.parallelize(tileintervalslist)
		paralleltileintervals.foreach(tilesearch)
	else:
		factorsfile=open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors","w")
		hardy_ramanujan_ray_shooting_queries(n)
		hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
		baker_harman_pintz_ray_shooting_queries(n)
		cramer_ray_shooting_queries(n)
		zhang_ray_shooting_queries(n)
		factors_accum=spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
		#spcon.parallelize(xrange(1,n)).foreach(tilesearch_nonpersistent)
		spcon.parallelize(spcon.range(1,n).collect()).foreach(tilesearch_nonpersistent)
		print "factors_accum.value = ", factors_accum.value
		factors=[]
		factordict={}
		for f in factors_accum.value:
			factors += f
		factordict[n]=factors
		json.dump(factordict,factorsfile)
		return factors
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    noPublisherRecords = sc.accumulator(0)
    noPublisherNameRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    keyCounts = data.values().flatMap(getKeys).countByValue()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "noPublisherRecords = %d" % noPublisherRecords.value
    print "noPublisherNameRecords = %d" % noPublisherNameRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"

    sc.stop()
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o','--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p','--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.  (See the hedged sketch after this function.)
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagTokenCounts = data.values().flatMap(getTokens).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "========================================"

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken in tagTokenCounts.keys():
        (tag, tokenValue) = tagToken.split(":", 1)
        count = tagTokenCounts[tagToken]
        if tag not in tagTokenLists:
            tagTokenLists[tag] = []
        tagTokenLists[tag].append(Token(tokenValue, count))

    # Process each tag separately:
    for tag in tagTokenLists.keys():
        tokenList = tagTokenLists[tag]

        # Sort the tokens by descending count and ascending token value:
        sortedTokenList = sorted(tokenList, key=attrgetter("value"))
        sortedTokenList = sorted(sortedTokenList, key=attrgetter("count"), reverse=True)

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with cumulative counts, fraction of
        # total (cumulative distribution function), and index
        # (enumerate the tokens per tag, starting with 1).
        print "========================================"
        tokenIndex = 0
        for token in sortedTokenList:
            tokenIndex += 1
            fractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:10d} {4:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value),
                                                                 token.count, token.cumulativeCount, fractionOfTotal))
        print "========================================"
def main(argv=None):
    """this is called if run from command line"""

    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--excludeTags", help="Comma-separated list of tags to exclude.", required=False)
    parser.add_argument("--includeTags", help="Comma-separated list of tags to include.", required=False)
    parser.add_argument("-i", "--input", help="Seq or tuple input file.", required=True)
    parser.add_argument("--inputTuples", help="The input file is in tuple format.", required=False, action="store_true")
    parser.add_argument("-o", "--output", help="UTF-8 output file on cluster.", required=False)
    parser.add_argument("-p", "--printToLog", help="Print results to log.", required=False, action="store_true")
    args = parser.parse_args()

    if args.excludeTags and args.includeTags:
        print "Pick either --excludeTags or --includeTags, not both."
        return 1

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    excludedTagCount = sc.accumulator(0)
    includedTagCount = sc.accumulator(0)
    tokenCount = sc.accumulator(0)

    if args.inputTuples:
        data = sc.textFile(args.input).map(lambda x: eval(x))
    else:
        data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagPhraseCounts = data.values().flatMap(getPhrasesMaker(args.includeTags, args.excludeTags)).countByValue()
    sc.stop()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, "wb", "utf-8") as f:
            for k in sorted(tagPhraseCounts):
                f.write(k + " " + str(tagPhraseCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "excludedTagCount = %d" % excludedTagCount.value
    print "includedTagCount = %d" % includedTagCount.value
    print "tokenCount = %d" % tokenCount.value
    if args.printToLog:
        for k in sorted(tagPhraseCounts):
            print json.dumps(k), tagPhraseCounts[k]
    print "========================================"
def SparkBroadcastAccumulator(n): 
	global broadcast_var
	global accumulator_var
	spcon = SparkContext("local[2]","SparkBroadcastAccumulator")
	broadcast_var=spcon.broadcast("broadcast_message")
	accumulator_var=spcon.accumulator(0)
	spcon.parallelize(xrange(1,n)).foreach(lambda x: broadcast_accumulator_receiver(accumulator_var.add(x)))
def longest_common_substring(strands):
	# create the Spark context
	conf = SparkConf().setAppName("longest_common_substring")
	sc = SparkContext(conf=conf)

	# create an accumulator for key-value pairs, where each key is a substring, and each value is the set of strings where the substring can be found
	class ArrayAccumulatorParam(AccumulatorParam):
		def zero(self, initialValue):
			return initialValue

		def addInPlace(self, v1, v2):
			if type(v2) is list:
				v1.extend(v2)
			elif type(v2) is tuple:
				v1.append(v2)

			return v1

	acc = sc.accumulator([], ArrayAccumulatorParam())

	def generate_substrings(data_element):
		k, v = data_element
		i = 0
		while i < len(v):
			j = i + 1
			while j < len(v):
				acc.add((v[i:j],k))
				j += 1
			i += 1

	sc.parallelize([(k, v) for k, v in strands.iteritems()]).foreach(generate_substrings)

	all_substrings = sc.parallelize(acc.value)
	return all_substrings.groupByKey().filter(lambda x: set(list(x[1])) == set(strands.keys())).takeOrdered(1, key=lambda x: -len(x[0]))[0][0]
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.textFile(args.input).map(lambda x: eval(x))
    keyCounts = data.values().flatMap(getKeys).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"

    sc.stop()
def word_count_compute(hdfs_input,hdfs_output,min_n,max_n):

    #TODO add start of sent
    # TODO large max size
    sums = []
    vocab_size = []
    sc = SparkContext("local","Simple Language Model Computing")
    file    = sc.textFile(hdfs_input)
    counts  = file.flatMap(lambda a : get_ngrams(a,min_n,max_n)).reduceByKey(lambda a, b: a + b)
    for i in range(min_n, max_n+1):
        temp = counts.filter(lambda a: is_ngram(a,i))
        accum = sc.accumulator(0)
        temp.foreach(lambda a: accum.add(a[1]))
        #temp_sum = sum(x[1] for x in temp.collect())
        sums.append(accum.value)
        temp_counts = temp.count()
        vocab_size.append(temp_counts)
        #print i,temp_counts,temp_sum

    print sums,vocab_size
    return counts,sums,vocab_size
Example #10
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: direct_kafka_wordcount.py <broker_list> <topic>",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 2)
    sqlContext = SQLContext(sc)
    sc.setLogLevel("WARN")

    ##############
    # Globals
    ##############
    globals()['maxTemp'] = sc.accumulator(0.0)

    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": brokers})
    jsonDStream = kvs.map(lambda (key, value): value)

    # Define function to process RDDs of the json DStream to convert them
    #   to DataFrame and run SQL queries
    def process(time, rdd):
        # Match local function variables to global variables
        maxTemp = globals()['maxTemp']

        print("========= %s =========" % str(time))
        print("rdd = %s" % str(rdd))
Example #11
words_new = sc.broadcast(["scala", "java", "hadoop", "spark", "akka"])

data = words_new.value
print("Stored data -> {}".format(data))

elem = words_new.value[2]
print("Printing a particular element in RDD -> {}".format(elem))
"""
Accumulator variables are used for aggregating the information through associative and commutative operations.
For example, you can use an accumulator for a sum operation or counters (in MapReduce).

The following example shows how to use an Accumulator variable.
An Accumulator variable has an attribute called value, similar to what a broadcast variable has.
It stores the data and is used to return the accumulator's value, but it is usable only in a driver program.

In this example, an accumulator variable is used by multiple workers and returns an accumulated value.
"""

num = sc.accumulator(10)


def f(x):
    global num
    num += x


rdd = sc.parallelize([20, 30, 40, 50])
rdd.foreach(f)
final = num.value
print("Accumulated value is -> {}".format(final))
Example #12
# multiple occurrences per article shouldn't be counted
counts = words.groupByKey()
print("Absolut count of articles containing the word:", counts.count())
print("Relative count of articles containing the word:",
      float(counts.count()) / rdd.count())

# articles containing the word
articles = sc.textFile("/user/bigdata/wikipedia-text-tiny-clean500") \
            .filter(lambda x: WORD in re.compile('\w+').findall(x.lower()))

# Each article is saved in a file in the folder 'output'
articles.saveAsTextFile('output')

# using accumulators
overall_count = sc.accumulator(0)
word_count = sc.accumulator(0)


def increment(article):
    overall_count.add(1)
    if WORD in article:
        word_count.add(1)


rdd.foreach(increment)
print("Absolut count: %s, relative count %f" %
      (word_count, float(word_count.value) / overall_count.value))

# using data frames with sql
sqlContext = SQLContext(sc)
#Boilerplate stuff:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation")
sc = SparkContext(conf=conf)

# The characters we wish to find the degree of separation between:
startCharacterID = 5306  #SpiderMan
targetCharacterID = 14  #ADAM 3,031 (who?)

# Our accumulator, used to signal when we find the target character during
# our BFS traversal.
hitCounter = sc.accumulator(0)


def convertToBFS(line):
    fields = line.split()
    heroID = int(fields[0])
    connections = []
    for connection in fields[1:]:
        connections.append(int(connection))

    color = 'WHITE'
    distance = 9999

    if (heroID == startCharacterID):
        color = 'GRAY'
        distance = 0

    return (heroID, (connections, distance, color))
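# Hedged sketch (not part of this excerpt) of how hitCounter would typically be
# consumed: a BFS map step expands GRAY nodes, bumps the accumulator whenever it
# touches targetCharacterID, and the driver checks hitCounter.value between
# iterations. bfsMap and the commented driver loop below are assumed names, not
# code from the original program.
def bfsMap(node):
    characterID, (connections, distance, color) = node
    results = []
    if color == 'GRAY':
        for connection in connections:
            if connection == targetCharacterID:
                hitCounter.add(1)  # signal the driver that the target was reached
            # the frontier nodes become GRAY one hop further away
            results.append((connection, ([], distance + 1, 'GRAY')))
        color = 'BLACK'
    # emit the processed node itself so its connection list is not lost
    results.append((characterID, (connections, distance, color)))
    return results

# Driver-side check (sketch): an action such as count() forces the flatMap to run,
# after which the accumulator value is visible on the driver.
#   rdd = rdd.flatMap(bfsMap)
#   rdd.count()
#   if hitCounter.value > 0:
#       print "Found the target character, from %d direction(s)." % hitCounter.value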
Example #14

def set_bit(x, value, length):
    # build a list of `length` bits with `value` at index x and zeros elsewhere
    bits = []
    for y in xrange(length):
        if (x == y):
          bits.append(value)
        else:
          bits.append(0)
    return bits

# Pre-bin counts of false positives for different threshold ranges
BINS = 101
nthresholds = 100
def bin(similarity):
    return int(similarity * nthresholds)

# fpCounts[i] = number of entries (possible false positives) where bin(similarity) == i
zeros = [0] * BINS
fpCounts = sc.accumulator(zeros, VectorAccumulatorParam())

def add_element(score):
    global fpCounts
    b = bin(score)
    fpCounts += set_bit(b, 1, BINS)

simsFullValuesRDD.foreach(add_element)

# Remove true positives from FP counts
def sub_element(score):
    global fpCounts
    b = bin(score)
    fpCounts += set_bit(b, -1, BINS)

trueDupSimsRDD.foreach(sub_element)
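
# VectorAccumulatorParam is referenced above but its definition falls outside this
# excerpt. A minimal list-based sketch compatible with the usage here (an accumulator
# initialized with [0] * BINS and updated with the lists produced by set_bit) might
# look like this; it is an assumption, not the original class.
from pyspark.accumulators import AccumulatorParam

class VectorAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        # a fresh vector of the same length, all zeros
        return [0] * len(value)

    def addInPlace(self, val1, val2):
        # element-wise addition of the two count vectors
        for i in xrange(len(val1)):
            val1[i] += val2[i]
        return val1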
        a_wenting += 1
    elif "dongshen" in element:
        a_dongshen += 1
    elif "qifeng" in element:
        a_qifeng += 1
    else:
        a_else += 1
    return element


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: input <file>")
        exit(-1)

    sc = SparkContext(appName="accumulatorTest")
    # Create the four counting accumulators
    a_wenting = sc.accumulator(0)
    a_dongshen = sc.accumulator(0)
    a_qifeng= sc.accumulator(0)
    a_else= sc.accumulator(0)

    lines = sc.textFile(sys.argv[1], 2).map(fun)

    for line in lines.collect():
        print line
    print a_wenting.value
    print a_else.value

    sc.stop()
Example #16
from functools import reduce

# Import pyspark modules
from pyspark import SparkConf, SparkContext, StorageLevel  # noqa

# Create a Spark configuration object, to create a context into which we create
# a session => The only object to be manipulated here is the spark_context
spark_conf = SparkConf().setMaster('local[*]')
spark_context = SparkContext(conf=spark_conf)


def convert(a):
    return [x for x in a if a.count(x) > 1]


acc = spark_context.accumulator(0)
scoreVar = spark_context.broadcast([1, 42, 100, 9999])


def score(assist, dmg, kill, position):
    if kill == 0 or assist / kill >= 5:
        acc.add(1)
    s = assist * scoreVar.value[1] + dmg * scoreVar.value[0] + kill * scoreVar.value[2]
    if position == 1:
        s += scoreVar.value[3]
    return s


try:
    # date	game_size	match_id	match_mode	party_size	player_assists	player_dbno	player_dist_ride	player_dist_walk	player_dmg	player_kills	player_name	player_survive_time	team_id	team_placement
Example #17
from pyspark import SparkContext, SparkConf
import numpy as np
conf = SparkConf()
conf.set('master', 'spark://hadoop-maste:7077')
context = SparkContext(conf=conf)
acc = context.accumulator(0)
print(type(acc), acc.value)
rdd = context.parallelize(np.arange(101), 5)


def acc_add(a):
    acc.add(a)
    return a


rdd2 = rdd.map(acc_add)
print(rdd2.collect())
print(acc.value)
context.stop()
#Boilerplate stuff:
from pyspark import SparkConf, SparkContext

#conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation")
#sc = SparkContext(conf = conf)

sc = SparkContext("yarn")

# The characters we wish to find the degree of separation between:
startCharacterID = 5306  #SpiderMan
targetCharacterID = 14  #ADAM 3,031 (who?)

# Our accumulator, used to signal when we find the target character during
# our BFS traversal.
hitCounter = sc.accumulator(
    0
)  # the variable is updated from every node because each node's update is relayed back to the
# driver and aggregated.


def convertToBFS(line):
    fields = line.split()
    heroID = int(fields[0])
    connections = []
    for connection in fields[
            1:]:  # will append all fields in that element (row) starting from fields[1]
        connections.append(int(connection))

    color = 'WHITE'
    distance = 9999
Example #19
        f = inputFiles[i]
        if subFileCount == filePartitionSize:
            subDirs.append(subDir)
            subDirNum += 1
            subFileCount = 0
            subDir = str(subDirNum) + "/"
            os.makedirs(input_dir + subDir)
        shutil.move(input_dir + f, input_dir + subDir)
        subFileCount += 1
    if subFileCount == filePartitionSize:
        subDirs.append(subDir)

    sc = SparkContext("local[" + numCores + "]",
                      "job",
                      pyFiles=[realpath('helper.py')])
    timeLoads = sc.accumulator([0] * len(intervals), VectorAccumulatorParam())
    bs2imsi2wasActivePrevTime = defaultdict(lambda: defaultdict())

    prev_idx = 0
    for i in range(len(subDirs)):
        d = subDirs[i]
        end_idx = intervals.index(dirTimeBoundaries[i])
        intervalBoundary = (prev_idx + 1, end_idx)  #both indexes are included
        prev_idx = end_idx

        bs2data = sc.textFile(input_dir + d + '*.gz').filter(filterData).map(
            generateBS2Data).reduceByKey(reduceBS2IMSI2Data)
        bs2data.foreach(getAccumLoad)
        print len(bs2imsi2wasActivePrevTime)
        resetDirectories(subDirs, input_dir)
        sys.exit()
        categories[category] = float(len(categories))

    # Hashing used to convert categorical input features to numeric values
    htf = HashingTF(5000)

    # Perform feature extraction on train and test data splits to feed the data to algorithm
    trainingData = inputRdd.map(lambda x: LabeledPoint(
        categories[x[1]],
        htf.transform([x[2], x[3].split('/')[0], x[4].split(':')[0], x[8]])))
    testingSet = testRdd.map(lambda x: htf.transform(
        [x[2], x[3].split('/')[0], x[4].split(':')[0], x[8]]))

    # Train the model on the train split. Classifies a record with the probability of occurrence of a crime category given month, hour, weekday, area.
    model = NaiveBayes.train(trainingData, 1.0)
    # Use the trained model to predict test data. Returns predicted labels for each record.
    predictions = model.predict(testingSet)

    # Get actual labels for the test records
    label_actual = testRdd.map(lambda x: categories[x[1]])

    # Initialize counter to note instances labels being corrected accurately
    correct_labels = sc.accumulator(0)
    for label_a, label_p in zip(label_actual.collect(), predictions.collect()):
        if (label_a == label_p):
            correct_labels.add(1)

    print "Accuracy is: " + str(
        (float(correct_labels.value) / float(predictions.count())) * 100)

    sc.stop()
Example #21
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("Sum_of_numbers.py").setMaster("local[2]")
sc = SparkContext(conf=conf)

data = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

numbersRdd = sc.parallelize(data)

sum = numbersRdd.reduce(lambda a,b: a+b)

print "Using reduce(): Sum Of given numbers is %i " %(sum)

sumAcc = sc.accumulator(0,"total")

# a lambda cannot contain an assignment, so update the accumulator with add()
numbersRdd.foreach(lambda x: sumAcc.add(x))

print "Using Accumulator: Sum Of given numbers is %i " %(sumAcc.value)
Example #22
    for count, topic_probability in enumerate(topic_distribution.toArray().tolist()):
        topic_distribution_dict["topic_{}".format(count)] = topic_probability

    return topic_distribution_dict


if __name__ == "__main__":
    sc = SparkContext(appName="Stream Layer", master="local[2]")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint_stream")

    sc._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', AWS_ACCESS_KEY_ID)
    sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', AWS_SECRET_ACCESS_KEY)

    number_of_tweets = sc.accumulator(0)

    # Kafka connection
    brokers = 'localhost:9092'
    topics = ["raw_tweets"]

    kvs = KafkaUtils.createDirectStream(ssc, topics, {"metadata.broker.list": brokers})
    # Kafka emits tuples, so we need to access the second element
    tweets = kvs.map(lambda tweet: tweet[1]).cache()

    # save to HDFS
    tweets.foreachRDD(save_stream)

    tweets = tweets.map(lambda tweet: json.loads(tweet))  # Convert strings to dicts
    tweets = parse_tweets(tweets)
import json
import math
import os
import sys

from pyspark import SparkContext
from pyspark import SparkFiles

sparkMaster = sys.argv[1]
inputFile = sys.argv[2]
outputDir = sys.argv[3]

sc = SparkContext(sparkMaster, appName="ChapterSixExample")
file = sc.textFile(inputFile)

# Count lines with KK6JKQ using accumulators
count = sc.accumulator(0)


def incrementCounter(line):
    global count  # Access the counter
    if "KK6JKQ" in line:
        count += 1

file.foreach(incrementCounter)
print "Lines with KK6JKQ %d" % count.value


# Create Accumulator[Int] initialized to 0
blankLines = sc.accumulator(0)
dataLines = sc.accumulator(0)
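
# blankLines and dataLines are declared above but their use falls outside this
# excerpt. A hedged sketch of the usual pattern (it mirrors the extractCallSigns
# example that appears later in this collection; the output path is an assumption):
def validateAndSplit(line):
    global blankLines, dataLines  # access the accumulators defined above
    if line == "":
        blankLines += 1
    else:
        dataLines += 1
    return line.split(" ")

callSigns = file.flatMap(validateAndSplit)
callSigns.saveAsTextFile(outputDir + "/callsigns")  # the action triggers the accumulator updates
print "Blank lines: %d" % blankLines.value
print "Data lines: %d" % dataLines.value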
Example #24
    update[edge[0] - 1] = a[edge[1] - 1]
    h_accum.add(update)


class VectorAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        return np.zeros(len(value))

    def addInPlace(self, v1, v2):
        v1 += v2
        return v1


for i in range(40):

    a_accum = sc.accumulator(a, VectorAccumulatorParam())
    edgeList.foreach(lambda edge: update_a(edge, h, a_accum))
    a = a_accum.value

    a = a / np.amax(a)

    h_accum = sc.accumulator(h, VectorAccumulatorParam())
    edgeList.foreach(lambda edge: update_h(edge, a, h_accum))
    h = h_accum.value

    h = h / np.amax(h)

asort = np.argsort(a)
# print(list(a))
print("a Worst: ", asort[:5] + 1)
print("a Best: ", asort[990:] + 1)
Example #25
from pyspark import SparkContext
from subprocess import call

execfile('PageRank.py')

# load original graph file
sc = SparkContext()

#graph_file = sc.textFile('hdfs:///user/leiyang/PageRank-test.txt')
#index_file = sc.textFile('hdfs:///user/leiyang/toy_index.txt')

graph_file = sc.textFile('hdfs:///user/leiyang/all-pages-indexed-out.txt', 80)
index_file = sc.textFile('hdfs:///user/leiyang/indices.txt', 16)

# initialize variables
nDangling = sc.accumulator(0)
lossMass = sc.accumulator(0.0)
damping = 0.85
alpha = 1 - damping
nTop, nIter = 200, 10
start = time()
print '%s: start PageRank initialization ...' %(logTime())
graph = graph_file.flatMap(initialize).reduceByKey(accumulateMass).map(getDangling) #.cache()
# get graph size
G = graph.count()
# broadcast dangling mass for redistribution
p_dangling = sc.broadcast(1.0*nDangling.value/G)
graph = graph.map(redistributeMass)

print '%s: initialization completed, dangling node(s): %d, total nodes: %d' %(logTime(), nDangling.value, G)
# run page rank
Example #26
from pyspark import SparkConf
from pyspark import StorageLevel
from pyspark.rdd import RDD
from termcolor import colored
import json

conf = SparkConf().setMaster('local').setAppName('PySparkShell')
sc = SparkContext(conf=conf)

# Set the log level
sc.setLogLevel("WARN")
# spark = SQLContext(sc)

inputFile = 'input_demo.txt'
outputDir = 'i0out.txt'
file = sc.textFile(inputFile)
# Create an Accumulator[Int] initialized to 0
blankLines = sc.accumulator(0)


def extractCallSigns(line):
    global blankLines  # access the global variable
    if (line == ""):
        blankLines += 1
    return line.split(" ")


callSigns = file.flatMap(extractCallSigns)
callSigns.saveAsTextFile(outputDir + "/callsigns")
print("Blank lines: %d" % blankLines.value)
    # set optimal parameters to run the algorithms
    conf = SparkConf()

    # disable all the timeouts so they don't cause any trouble when running big data
    conf.set("spark.network.timeout", "36001s")
    conf.set("spark.executor.heartbeatInterval", "36000s")
    conf.set("spark.storage.blockManagerSlaveTimeoutMs", "36000s")
    conf.set("spark.worker.timeout", "36000s")
    conf.set("spark.sql.broadcastTimeout", "36000s")

    # give both executors and drivers enough memory so they can execute faster; the exact numbers can be adjusted
    conf.set("spark.executor.memory", "5g")
    conf.set("spark.driver.memory", "8g")
    conf.set("spark.worker.cleanup.enabled", "true")
    sc = SparkContext("local[3]", "PageRanking", conf=conf)
    deadendaccumulator = sc.accumulator(0)

    # filtering mini database to be of form (from node, list of to nodes)
    data = sc.textFile("./Dataset/web-Google.txt").filter(lambda l: not str(l).startswith("#")) \
        .flatMap(addEntries) \
        .reduceByKey(lambda x, y: x + y)

    #filtering for the big dataset
    #data = data.filter(lambda x: int(x[1]) != 0 and int(x[1]) <= 50000000).map(lambda x: (
    #    int(x[1]) - 1,
    #    list(map(lambda x: int(x), filter(lambda x: x != "" and int(x) <= 50000000, str(x[0]).split(" "))))))

    # pass the input to pagerank algorithm
    result = pageRank(data)

    # write result to csv file
Example #28
    def CoKNNSVMTrainAndPredictOnSpark(self):
        """
        Train the model and predict the results
        """
        global TOTALFEATURESANDLABEL
        sc = SparkContext(appName="CoKNNSVMTrainAndPredictOnSpark")
        TOTALFEATURESANDLABEL = sc.accumulator([],
                                               ListParamForFeatureAndLabel())
        features = sc.textFile(self.__filepath)

        def makefeatures(line):
            """
            Split out the class name based on "_v"
            :param line: features of the key frame
            """
            classname = os.path.basename(line[0]).split("_v")[0]
            classnum = self.__classmap[classname]
            return (float(classnum), [float(x) for x in line[1]])

        def getmodelandaccuary(line):
            """
            Train the model and predict the results
            :param line: the features directory to read on HDFS
            :return: accuracy
            """
            global TOTALFEATURESANDLABEL
            TOTALFEATURESANDLABEL += [(line[0], line[1])]

        # features.map(lambda x:x.split(" ")).map(getmodelandaccuary).repartition(1).saveAsTextFile(self.__savepath)
        features.map(lambda x: x.split(" ")).map(lambda x: (x[1], x[2:])).map(
            makefeatures).map(getmodelandaccuary).count()
        totalfeaturesandlabel = TOTALFEATURESANDLABEL.value

        def getfeaturelistandlabellist(totalfeaturesandlabel):
            """
            Extract the (label, features) tuples from the accumulator to form a label list and a features list
            :param totalfeaturesandlabel: tuples of label and features
            :return: (label list, features list)
            """
            TOTALFEATURES = []
            TOTALLABEL = []
            for i in range(0, len(totalfeaturesandlabel)):
                TOTALLABEL.append(totalfeaturesandlabel[i][0])
                TOTALFEATURES.append(totalfeaturesandlabel[i][1])
            return (TOTALLABEL, TOTALFEATURES)

        totallabel, totalfeatures = getfeaturelistandlabellist(
            totalfeaturesandlabel)
        # y = totallabel
        # x = totalfeatures

        # x = Co_KNN_SVM_Utilities.getfeatureforlibsvm(x)

        random_index = [i for i in range(len(totallabel))]
        # test_random_index = [i for i in range(len(x))]
        random.shuffle(random_index)
        # random.shuffle(test_random_index)
        random_y = [totallabel[x] for x in random_index]
        random_x = [totalfeatures[x] for x in random_index]
        # random_test_y = [test_y[x] for x in test_random_index]
        # random_test_x = [test_x[x] for x in test_random_index]
        # random_train_y = [train_y[x] for x in train_random_index]
        # random_train_x = [train_x[x] for x in train_random_index]
        # random_test_y = [test_y[x] for x in test_random_index]
        # random_test_x = [test_x[x] for x in test_random_index]
        # random_train_y = train_y
        # random_train_x = train_x
        # random_test_y = test_y
        # random_test_x = test_x
        # train_y = random_y[0:1500]
        # train_x = random_x[0:1500]
        # test_y = random_y[1500:1580]
        # test_x = random_x[1500:1580]
        train_x, test_x, train_y, test_y = train_test_split(totalfeatures,
                                                            totallabel,
                                                            test_size=0.2,
                                                            shuffle=False)
        # train_y = totallabel[0:800]
        # train_x = totalfeatures[0:800]
        # test_y = totallabel[800:1580]
        # test_x = totalfeatures[800:1580]
        Co_KNN_SVM_New.Co_KNN_SVM(train_y, train_x, test_y, test_x,
                                  self.__savepath)
			return Counter()
		def addInPlace(self, hashdict, items):
			hashdict.update(items)
			return hashdict

if __name__ == '__main__':
	sc = SparkContext(appName="FraMultiStage")

	data_input = 's3://progetto-analisi-di-dati-unimi/dataset'
	data_output = 's3://progetto-analisi-di-dati-unimi/output_multistage/'
	split_by, supp, combsize = ',', 18000, 2
	
	data = sc.textFile(data_input).map(lambda x: sorted(set(x.split(split_by))))

	#Converting item names to numbers
	item_to_n = sc.accumulator(dict(), DictAccumulatorParam())
	data.foreach(item_to_n.add)
	item_to_n = item_to_n.value
	data = data.map(lambda x: [item_to_n[i] for i in x])

	#Hashmaps and their hash functions
	hashmap1 = sc.accumulator(Counter(), HashMapAccumulator())
	hashmap2 = sc.accumulator(Counter(), HashMapAccumulator())

	#Get frequent items from bucket
	def getFreq(bucket): return filter(lambda i: i in freq, bucket)

	#Hash functions for each hashmap
	def hashf1(x): return sum(x) % 21243
	def hashf2(x): return sum(x) % 10621
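
#DictAccumulatorParam is used above but its definition falls outside this excerpt.
#A hedged sketch consistent with that usage (an accumulator seeded with dict() and
#fed whole buckets of item names via item_to_n.add) might look like this; it is an
#assumption, not the original class.
from pyspark.accumulators import AccumulatorParam

class DictAccumulatorParam(AccumulatorParam):
	def zero(self, initial):
		return {}

	def addInPlace(self, d, items):
		#items is either a bucket (list of item names) added on a worker,
		#or another partial dict being merged back on the driver
		for name in items:
			if name not in d:
				d[name] = len(d)
		return d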
Example #30
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("DegreesOfSeparation")
sc = SparkContext(conf=conf)

start_character_id = 5306  # Spiderman
target_character_id = 14  # Adam

hit_counter = sc.accumulator(0)


def convert_to_bfs(line):
    fields = line.split()
    hero_id = int(fields[0])
    connections = []
    for connection in fields[1:]:
        connections.append(int(connection))

    color = 'WHITE'
    distance = 9999

    if(hero_id == start_character_id):
        color = 'GRAY'
        distance = 0

    return (hero_id, (connections, distance, color))


def create_starting_rdd():
    input_file = sc.textFile("../data/Marvel-graph.txt")
    return input_file.map(convert_to_bfs)
Example #31
    x = re.sub("'", '', x)
    return re.sub('[?!@#$\'",.;:()]', '', x).lower()


def countWord(line):
    global count_number
    if (line == "Tokyo"):
        count_number += 1
    return line.split(' ')


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print >> sys.stderr, "Usage: wordcount <master> <inputfile> <outputfile>"
        exit(-1)
    sc = SparkContext(sys.argv[1],
                      "python_wordcount_sorted in bigdataprogrammiing")
    lines = sc.textFile(sys.argv[2], 2)
    count_number = sc.accumulator(0)

    counts = lines\
    .flatMap(countWord)\
    .filter(lambda x: x!="Tokyo")\
    .map(lambda x: (x.lower(), 1))\
    .reduceByKey(lambda x,y:x+y)\
    .sortByKey(ascending=True)

    counts.saveAsTextFile("hdfs://localhost:9000/output")
    print('Number of Tokyo : ', count_number.value)
    sc.stop()
def main():
    num_factors = int(sys.argv[1])
    num_workers = int(sys.argv[2])
    num_itrns = int(sys.argv[3])
    beta_value = float(sys.argv[4])
    lambda_value = float(sys.argv[5])
    inputV_file = sys.argv[6]
    outputW_file = sys.argv[7]
    outputH_file = sys.argv[8]

    #initialize spark context
    #using conf
    #conf = SparkConf().setAppName("My application").setMaster("local")
    #sc = SparkContext(conf=conf)
    #using only SparkContext
    sc = SparkContext("local", "dsgd_mf")
    #use accumulator to update values of the loss over every iteration
    L_NZSL = sc.accumulator(0)
    print "L_NZSL value is ", str(L_NZSL.value)

    #read directory or file
    if os.path.isdir(inputV_file):
        #read data and get it from the input files
        RDD = get_data_in_RDD_folder(sc, inputV_file)
        #construct the V matrix, the user and movie hashmaps
        RDD_matrix_V, user_Hashmap, movie_Hashmap, N_j, N_i = get_matrix_V_from_netflix(
            RDD)

    elif os.path.isfile(inputV_file):
        #read data and get it from the input files
        RDD = get_data_in_RDD_file(sc, inputV_file)
        #construct the V matrix, the user and movie hashmaps
        RDD_matrix_V, user_Hashmap, movie_Hashmap, N_j, N_i = get_matrix_V_from_autolab(
            RDD)

    start_time = time.time()
    print "Time is ", str(start_time)

    #persist the ratings matrix as we will be using this throughout
    RDD_matrix_V = RDD_matrix_V.persist()

    #construct initial W and H matrices with random value between 0 and 1
    RDD_matrix_W, RDD_matrix_H = construct_initial_factors(
        sc, user_Hashmap, movie_Hashmap, num_factors)

    #now partition the data ( V and W)
    partition_W = RDD_matrix_W.partitionBy(num_workers).persist()
    partition_V = RDD_matrix_V.partitionBy(num_workers).persist()

    l_values = []  #stores the value of loss over every iteration
    block_size = len(movie_Hashmap.keys(
    )) / num_workers  #size of the block is number of movies/ num of workers

    total_n = 0
    #each iteration below covers one diagonal (stratum), not the whole matrix at once, hence the number of iterations
    #is the number of passes over the whole data times the number of workers

    new_itrns = num_itrns * num_workers
    #Following are the initial values to keep track of convergence of W and H for autolab data
    prev_W_intersect = 1000000000
    prev_H_intersect = 1000000000
    #Now for each iteration
    for i in range(0, new_itrns):
        #get strata for this iteration by joining with W
        partitions = partition_V.join(partition_W, numPartitions=num_workers)
        #now map partition with index
        strata_for_this = partitions.mapPartitionsWithIndex(
            get_partition, preservesPartitioning=True)
        #Filter the strata now
        new_block_strata = strata_for_this.filter(
            partial(get_block, i, block_size, num_workers))
        #now get movie id and data
        #get the map for H (movies)
        new_map = RDD_matrix_H.collectAsMap()

        #here, call the update_WH function now which performs the gradient update
        updated_W_and_H = new_block_strata.mapPartitions(
            partial(update_WH, lambda_value, N_i, N_j, num_workers, new_map,
                    beta_value, total_n, L_NZSL),
            preservesPartitioning=True)
        #got updated maps from different blocks in parallel
        total_n = total_n + new_block_strata.count()
        #get RDD of new updated W and H
        W_list = updated_W_and_H.flatMap(lambda x: x[0]).collect()
        H_list = updated_W_and_H.flatMap(lambda x: x[1]).collect()
        RDD_matrix_W_new = sc.parallelize(W_list).sortByKey()
        RDD_matrix_H_new = sc.parallelize(H_list).sortByKey()
        #Compute the square of difference on W and H , for convergence
        RDD_intersect_W = sum(
            RDD_matrix_W_new.join(RDD_matrix_W).map(
                lambda x: (x[1][0] - x[1][1])**2).sum())
        RDD_intersect_H = sum(
            RDD_matrix_H_new.join(RDD_matrix_H).map(
                lambda x: (x[1][0] - x[1][1])**2).sum())

        #Update and construct the new W and H for the next iteration; some entries are replicated when a user appears multiple times in a stratum
        RDD_matrix_W = RDD_matrix_W_new.union(
            RDD_matrix_W.subtractByKey(RDD_matrix_W_new))
        RDD_matrix_H = RDD_matrix_H_new.union(
            RDD_matrix_H.subtractByKey(RDD_matrix_H_new))
        RDD_matrix_W = RDD_matrix_W.sortByKey()
        RDD_matrix_H = RDD_matrix_H.sortByKey()
        partition_W = RDD_matrix_W.partitionBy(num_workers)

        #if the whole matrix has been covered (the input value of iteration)
        if (i + 1) % num_workers == 0:
            #update and check for convergence
            #if prev_W_intersect - RDD_intersect_W <= 0.00001 and prev_H_intersect - RDD_intersect_H <= 0.00001:
            #    print "W diff "+str(RDD_intersect_W - prev_W_intersect)
            #    print "H diff "+str(RDD_intersect_H - prev_H_intersect)
            #    print "converged"
            #    break
            #prev_W_intersect = RDD_intersect_W
            #prev_H_intersect = RDD_intersect_H
            #print "W diff new "+str(prev_W_intersect)
            #print "H diff new "+str(prev_H_intersect)
            #record the loss for this iteration and reset accumulator to zero
            l_values.append(L_NZSL)
            L_NZSL = sc.accumulator(0)
    #Now write out the new W and H to the files mentioned
    i = 0
    for l in l_values:
        print "iteration: " + str(i) + " L_NZSL value is " + str(l.value)
        i = i + 1
    print
    print

    #Now write out the final W and H matrices to the mentioned files
    write_matrix(RDD_matrix_H, RDD_matrix_W, outputW_file, outputH_file,
                 user_Hashmap, movie_Hashmap, num_factors)

    print "time taken  was ", str(start_time - time.time())
Example #33
    os.makedirs(input_dir + subDir)
    for i in range(len(inputFiles)):
         f = inputFiles[i]
         if subFileCount==filePartitionSize:
              subDirs.append(subDir)
              subDirNum += 1
              subFileCount = 0
              subDir = str(subDirNum) + "/"
              os.makedirs(input_dir + subDir)
         shutil.move(input_dir + f,input_dir + subDir)
         subFileCount += 1
    if subFileCount==filePartitionSize:
         subDirs.append(subDir)
    
    sc = SparkContext("local[" + numCores + "]" , "job", pyFiles=[realpath('helper.py')])
    timeLoads = sc.accumulator([0]*len(intervals), VectorAccumulatorParam())

    prev_idx = 0
    numBS = 0
    for i in range(len(subDirs)):
         d = subDirs[i]
         end_idx = intervals.index(dirTimeBoundaries[i])
         intervalBoundary = (prev_idx+1,end_idx) #both indexes are included
         prev_idx = end_idx

         bs2data = sc.textFile(input_dir + d + '*.gz').filter(filterData).map(generateBS2Data).reduceByKey(reduceBS2IMSI2Data)
         bs2data.foreach(getAccumLoad)
         if (bs2data.count() >= numBS):
              numBS = bs2data.count()
    
    mean = [float(x)/numBS for x in timeLoads.value]
Example #34
#action operations
print('------------------action-------------')
#collect returns the data to the driver; with large datasets this risks running out of memory
print(rdd.collect())
#take returns the first few elements, similar to a limit
print(rdd.take(4))
#takeSample returns a random sample of elements
print(rdd.takeSample(False, 5, 0))
#first returns the first element
print(rdd.first())
#count returns the number of elements in the RDD
print(rdd.count())
#reduce accumulates the elements in turn: (1+2)+3+4+5...
print(rdd.reduce(lambda x, y: x + y))
#foreach runs a function on each element without producing a new RDD (foreach is an action, map is a transformation)
accum = sc.accumulator(0)
rdd.foreach(lambda x: accum.add(x))
print(accum.value)
#countByKey counts the number of elements per key
pairrdd = sc.parallelize([
    (1, 1),
    (1, 4),
    (2, 1),
    (3, 1),
    (1, 6),
])
print(pairrdd.countByKey())

#transformation operations
print('------------------transformation-------------')
#map applies a function to each element
Example #35
            "Usage: to gracefully shutdown type echo 1 > /tmp/flag at the terminal"
        )
        exit(-1)

    app_name = "Momentum"
    sc = SparkContext(appName=app_name)  #, pyFiles = ['./cep/redisQueue.py'])
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint('../checkpoint')

    brokers, qname, id, fn = sys.argv[1:]
    id = int(id)

    #
    # demonstrate how to use broadcast variable
    #
    NumProcessed = sc.accumulator(0)
    Q = sc.broadcast({
        'rname': 'rname',
        'qname': qname,
        'namespace': 'mdq',
        'host': 'localhost',
        'port': 6379,
        'db': 3,
        'alert_bot_q': ('msg_bot', 'chatq')
    })
    Threshold = sc.broadcast(0.00015)
    #kvs = KafkaUtils.createDirectStream(ssc, ['ib_tick_price', 'ib_tick_size'], {"metadata.broker.list": brokers})
    kvs = KafkaUtils.createStream(ssc, brokers, app_name, {
        'ib_tick_price': 1,
        'ib_tick_size': 1
    })
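
    # Hedged sketch (not in this excerpt) of how the broadcast variables above might
    # be read on the executors: process_tick and its 'momentum' field are assumed
    # names. Workers read Q.value and Threshold.value instead of having the dict and
    # threshold shipped with every task, and NumProcessed counts handled ticks.
    def process_tick(tick):
        NumProcessed.add(1)
        q_conf = Q.value                      # broadcast queue configuration
        if abs(tick.get('momentum', 0.0)) >= Threshold.value:
            # e.g. route an alert towards the queue described by q_conf
            return [(q_conf['alert_bot_q'], tick)]
        return []

    # alerts = kvs.map(lambda (k, v): json.loads(v)).flatMap(process_tick)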
Example #36
import numpy as np

srtm_dtype = np.dtype('>i2')
filename_regex = re.compile('([NSEW]\d+[NSEW]\d+).*')

# The data directory, needs to be available to all node in the cluster
data_files = '/media/bitbucket/srtm/version2_1/SRTM3/North_America'

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'srtm')

# Now load all the zip files into a RDD
data = sc.binaryFiles(data_files)

# The two accumulators are used to collect values across the cluster
num_samples_acc = sc.accumulator(0)
sum_acc = sc.accumulator(0)

# Function to array
def read_array(data):
    hgt_2darray = np.flipud(np.fromstring(data, dtype=srtm_dtype).reshape(1201, 1201))

    return hgt_2darray

# Function to process a HGT file
def process_file(file):
    (name, content) = file

    filename = os.path.basename(name)
    srtm_name = filename.split('.')[0]
    match = filename_regex.match(srtm_name)
Example #37
	#Create W H V
	W, H = CreateHW()
	num_users = W.shape[0]
	num_movies = H.shape[1]
	V = CreateMatrix(num_users, num_movies)

	# Initialize sc
	conf = SparkConf().setAppName('DSGD').setMaster('local[%d]' % num_workers)
	sc = SparkContext(conf=conf)

	# Intialize strata
	init_strata = [[i, v] for i, v in enumerate(np.random.permutation(num_workers))]
	S = sc.parallelize(init_strata)

	# Initialize clock
	clock = sc.accumulator(0)

	# Iteration
	for i in xrange(num_iterations) :
		# Get rows, cols from strata		
		split = S.map(GetRowCol).collect()
		# Get block from rows, cols
		matrices = []
		for row, col in split :
			V_block = V.tocsr()[row, :].tocsc()[:, col]
			W_block = W[row, :].copy()
			H_block = H[:, col].copy()
			matrices.append((V_block, W_block, H_block))
		# Set clock
		clk = clock.value
		# Calculate gradient
Example #38
#!/usr/bin/env python

from pyspark import SparkContext
import sys



if __name__ == "__main__":
    def extractNumLines(line):
        global lines
        lines += 1
        # print lines, ":", line, "\n\n"
        return line

    sc = SparkContext(appName="CountKeys")
    file = sc.sequenceFile( sys.argv[1])
    rdd = file.reduceByKey(lambda x, y:  x)

    lines = sc.accumulator( 0)
    num_lines = rdd.map(extractNumLines)
    num_lines.collect()
    print "Num lines: %d" % lines.value
Example #39
        send_email(EMAIL_DESTINATION, NOTIFICATION_MESSAGE)
    elif (acc1.value % 10 != 0 and acc2.value == 1):
        if (acc2.value > 0):
            acc2.add(-1)


conf = SparkConf().setAppName("Arduino Notification").setMaster('local[*]')

sparkContext = SparkContext(conf=conf)
sparkContext.setLogLevel("ERROR")

streamingContext = StreamingContext(sparkContext, 1)

dstream = streamingContext.socketTextStream(SOCKET_HOST, SOCKET_PORT)
# Micro Batches

# Sensor stream
count_sensor_read = sparkContext.accumulator(0)
status_mailed = sparkContext.accumulator(0)

data = dstream.filter(lambda _data: float(_data) <= WATER_LEVEL_THRESHOLD)

data.foreachRDD(lambda rdd: send_mail(rdd, count_sensor_read, status_mailed))

data.pprint()
# End of sensor stream
# End of micro batches

streamingContext.start()
streamingContext.awaitTermination()
import sys
from pyspark import SparkContext

if __name__ == "__main__":
  if len(sys.argv) != 2:
    print >> sys.stderr, "Usage: AverageWordLength <file or directory>"
    exit(-1)
  
  sc = SparkContext()
  totalWords = sc.accumulator(0)
  totalLetters = sc.accumulator(0.0)
  words = sc.textFile(sys.argv[1]).flatMap(lambda line: line.split())
  def addTotals(word, words, letters):
    words += 1
    letters += len(word)
  words.foreach(lambda word: addTotals(word,totalWords,totalLetters))
  print "Average word length:", totalLetters.value/totalWords.value
Example #41
def run_spark_job(tile_dim):
    from pyspark import SparkConf, SparkContext
    from pyspark.accumulators import AccumulatorParam

    class ImageSourceAccumulatorParam(AccumulatorParam):
        """
        Accumulator that will collect our image data that will be
        included as part of the input to the next stage of processing.
        """
        def zero(self, dummy):
            return []

        def addInPlace(self, sources1, sources2):
            res = []
            if sources1:
                res.extend(sources1)
            if sources2:
                res.extend(sources2)
            return res

    request_uri = sys.argv[1]

    # If there are more arguments, it's to turn off notifications
    publish_notifications = True
    if len(sys.argv) == 3:
        publish_notifications = False

    parsed_request_uri = urlparse(request_uri)
    request = None
    if not parsed_request_uri.scheme:
        request = json.loads(open(request_uri).read())
    else:
        client = boto3.client("s3")
        o = client.get_object(Bucket=parsed_request_uri.netloc, Key=parsed_request_uri.path[1:])
        request = json.loads(o["Body"].read())

    source_uris = request["images"]
    workspace = request["workspace"]
    jobId = request["jobId"]
    target = request["target"]

    if publish_notifications:
        notify_start(jobId)

    try:
        uri_sets = create_uri_sets(source_uris, workspace)
        image_count = len(uri_sets)

        conf = SparkConf().setAppName(APP_NAME)
        sc = SparkContext(conf=conf)

        image_source_accumulator = sc.accumulator([], ImageSourceAccumulatorParam())

        def create_image_sources(uri_set, acc):
            image_source = create_image_source(uri_set.source_uri, uri_set.workspace_source_uri, uri_set.image_folder, uri_set.order, tile_dim)
            acc += [image_source]
            return image_source

        def uri_set_copy(uri_set):
            copy_to_workspace(uri_set.source_uri, uri_set.workspace_target)
            return uri_set

        uri_set_rdd = sc.parallelize(uri_sets, image_count).map(uri_set_copy)
        image_sources = uri_set_rdd.map(lambda uri_set: create_image_sources(uri_set, image_source_accumulator))
        chunk_tasks = image_sources.flatMap(lambda image_source: generate_chunk_tasks(image_source, tile_dim))
        chunks_count = chunk_tasks.cache().count()
        numPartitions = max(chunks_count / 10, min(50, image_count))

        chunk_tasks.repartition(numPartitions).foreach(process_chunk_task)

        image_sources = image_source_accumulator.value
        print "Processed %d images into %d chunks" % (len(image_sources), chunks_count)

        input_info = map(construct_image_info, sorted(image_sources, key=lambda im: im.order))

        result = {
            "jobId": jobId,
            "target": target,
            "tileSize": tile_dim,
            "input": input_info
        }

        # Save off result
        workspace_parsed = urlparse(workspace)
        if not workspace_parsed.scheme:
            # Save to local files system
            open(os.path.join(workspace, OUTPUT_FILE_NAME), 'w').write(json.dumps(result))
        elif workspace_parsed.scheme == "s3":
            client = boto3.client("s3")

            bucket = workspace_parsed.netloc
            key = os.path.join(workspace_parsed.path, OUTPUT_FILE_NAME)[1:]

            client.put_object(Bucket=bucket, Key=key, Body=json.dumps(result))
    except Exception, e:
        if publish_notifications:
            notify_failure(jobId, "%s: %s" % (type(e).__name__, e.message))
        raise
Example #42
from pyspark import SparkContext, SparkConf
from Utils import Utils

if __name__ == "__main__":
    conf = SparkConf().setAppName('StackOverflow Example').setMaster(
        "local[*]")
    sc = SparkContext(conf=conf)
    total = sc.accumulator(0)
    missingSalaryMidPoint = sc.accumulator(0)
    responseRDD = sc.textFile("2016-stack-overflow-survey-responses.csv")
    processedBytes = sc.accumulator(0)

    def filterResponseFromCanada(response):
        processedBytes.add(len(response.encode('utf-8')))
        splits = Utils.COMMA_DELIMITER.split(response)
        total.add(1)
        if not splits[14]:
            missingSalaryMidPoint.add(1)
        return splits[2] == "Colombia"

    responseFromCanada = responseRDD.filter(filterResponseFromCanada)
    print("Count of responses from Colombia: {}".format(
        responseFromCanada.count()))
    print("Total count of responses {}".format(total.value))
    print("Count of responses missing salary middle point: {}".format(
        missingSalaryMidPoint.value))
    print("Number of bytes processed: {}".format(processedBytes))
ZAQAR_URL='http://10.0.1.107:8888/'
ZAQAR_VERSION=1.1

def get_client():
    return zaqarclient.Client(ZAQAR_URL, ZAQAR_VERSION, conf=conf)

def total_emitter(acc):
    client = get_client()
    queue = client.queue('log_totals')
    while True:
        time.sleep(5)
        queue.post({'body': acc.value, 'ttl': 300})

if __name__ == '__main__':
    sc = SparkContext(appName='SparkharaLogCounter')
    ssc = StreamingContext(sc, 1)

    total_lines = sc.accumulator(0)

    def rdd_print(rdd):
        a = rdd.collect()
        total_lines.add(len(a))

    lines = ssc.socketTextStream('0.0.0.0', 9901)
    lines.foreachRDD(rdd_print)

    th = threading.Thread(target=total_emitter, args=(total_lines,))
    th.start()
    ssc.start()
    ssc.awaitTermination()
def main(input_path, output_path):

    sc = SparkContext(appName='Data_Analysis')

    '''
    Define spark accumulators for counting total records,
    valid records and empty string checking
    '''
    empty_records = {}  # accumulator check the emptiness of columns
    for column in CHECK_EMPTY_COLUMNS:
        empty_records[column] = sc.accumulator(0)

    total_records = sc.accumulator(0)
    valid_records = sc.accumulator(0)

    # collection all desired statistics
    stats_collector = {}

    # load raw dataset
    raw_rdd = sc.textFile(input_path).map(lambda x: x.split('|'))
    # validate whether the data fulfills the definition of the data dictionary
    validate_rdd = raw_rdd.filter(lambda x: data_validate(x, VALIDATION_LIST, total_records, valid_records))

    '''
    Keep only the following columns: 'IMSI', 'EVENT_TYPE', 'CGI', 'DATETIME', 'BBC'
    '''
    # load fixed cell master file for cgi and bbc mapping
    cell_master_dict = {}
    with open(CELL_MASTER_FILE, 'r') as f:
        for line in f:
            line = line.strip()
            line = line.split('|')
            cell_master_dict[line[0]] = line[9]

    transform_rdd = validate_rdd.map(lambda x: data_tranform(x, cell_master_dict, empty_records))

    '''
    Filter out records for JABODETABEK
    '''
    Jabodetabek_rdd = transform_rdd.filter(lambda x: x[-1] == 'JABODETABEK').cache()
    stats_collector['Jabodetabek_records'] = Jabodetabek_rdd.count()
    print 'Number of Jabodetabek records: %d' % stats_collector['Jabodetabek_records']

    stats_collector['total_records'] = total_records.value
    print 'Number of total records: %d' % stats_collector['total_records']

    stats_collector['valid_records'] = valid_records.value
    print 'Number of valid records: %d' % stats_collector['valid_records']

    stats_collector['empty_records'] = {}
    if stats_collector['valid_records'] != 0:
        for column in CHECK_EMPTY_COLUMNS:
            stats_collector['empty_records'][column] = empty_records[column].value
            print 'Empty record in column %s is %d, with percentage %.2f' % (column, empty_records[column].value,
                                                                             empty_records[column].value / float(
                                                                                 stats_collector['valid_records']))

    if stats_collector['Jabodetabek_records'] > 0:

        '''
        Generate the aggregate count distribution over time
        '''
        # aggregate count distribution over time per event type
        event_datetime_pair = Jabodetabek_rdd.map(lambda x: ((x[1], round_datetime(x[3], 10, 'minutes')), 1))
        agg_event_datetime_pair = event_datetime_pair.reduceByKey(lambda x, y: x + y).sortByKey(ascending=True)
        stats_collector['event_time_distribution'] = {}
        for item in agg_event_datetime_pair.collect():
            if item[0][0] not in stats_collector['event_time_distribution']:
                stats_collector['event_time_distribution'][item[0][0]] = [[item[0][1], item[1]]]
            else:
                stats_collector['event_time_distribution'][item[0][0]].append([item[0][1], item[1]])
        print 'Event time records distribution: %s' % str(stats_collector['event_time_distribution'])

        # overall aggregate count distribution over time
        event_time_distribution = stats_collector['event_time_distribution']
        event_list = event_time_distribution.keys()

        time_series_dict = dict()
        for event_type in event_list:
            for item in event_time_distribution[event_type]:
                if item[0] not in time_series_dict:
                    time_series_dict[item[0]] = item[1]
                else:
                    time_series_dict[item[0]] += item[1]

        time_distribution = [[t, time_series_dict[t]] for t in time_series_dict]
        time_distribution = sorted(time_distribution, key=lambda x: x[0])

        stats_collector['time_distribution'] = time_distribution
        print 'Time records distribution: %s' % str(stats_collector['time_distribution'])

        # Group by records according different event
        stats_collector['event_distribution'] = [[e_type, sum([event_time_distribution[e_type][i][1] for i in
                                                               range(len(event_time_distribution[e_type]))])]
                                                 for e_type in event_list]
        print 'Event distribution is: %s' % stats_collector['event_distribution']


        '''
        Convert RDD to dataframe
        '''
        sql_context = SQLContext(sc)
        fields = [StructField(field_name, StringType(), True) for field_name in SCHEMA_NAMES]
        schema = StructType(fields)
        df = sql_context.createDataFrame(Jabodetabek_rdd, schema).cache()

        '''
        Unique IMSI per hour
        '''
        udf_round_datetime = udf(round_datetime, StringType())
        df = df.select(df['*'], udf_round_datetime(df['DATETIME']).alias('Hour'))
        stats_collector['imsi_per_hour'] = (df.groupBy('Hour').agg(countDistinct('IMSI').alias('UNIQUE_IMSI'))
                                            .rdd.map(lambda x: [x['Hour'], x['UNIQUE_IMSI']]).collect())

        '''
        Number of records per imsi distribution
        '''
        count_by_imsi = df.groupBy('IMSI').count().selectExpr('count as num_records') # ['IMSI', 'num_records']
        stats_collector['total_imsi'] = count_by_imsi.count()
        print 'Total number of imsi is %d' % stats_collector['total_imsi']

        stats = count_by_imsi.selectExpr('avg(num_records) as mean', 'stddev(num_records) as std').collect()
        mean_value = stats[0]['mean']
        std_value = stats[0]['std']
        upper_limit = mean_value + 3 * std_value # mean + 3 * standard deviation

        count_by_imsi_filtered = count_by_imsi.filter(count_by_imsi['num_records'] <= upper_limit)
        agg_by_count = count_by_imsi_filtered.groupBy('num_records').count()

        stats_collector['imsi_distribution'] = agg_by_count.rdd.map(lambda x: [x['num_records'], x['count']]).collect()
        print 'IMSI distribution is: %s' % stats_collector['imsi_distribution']

        output_rdd = sc.parallelize([json.dumps(stats_collector)])
        output_rdd.saveAsTextFile(output_path)
    else:
        output_rdd = sc.parallelize([json.dumps(stats_collector)])
        output_rdd.saveAsTextFile(output_path)
        raise ValueError('No record for Jabodetabek Found.')
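# The job above reads total_records, valid_records and the per-column empty_records counters,
# which must have been created on the driver before any transformation runs. A minimal,
# self-contained sketch of that accumulator pattern (an assumption, not part of the original
# script; CHECK_EMPTY_COLUMNS and the sample row below are made up for illustration):
from pyspark import SparkContext

sc = SparkContext("local[2]", "empty_column_accumulators_sketch")
CHECK_EMPTY_COLUMNS = ['IMSI', 'CGI', 'DATETIME']      # hypothetical column list
empty_records = dict((c, sc.accumulator(0)) for c in CHECK_EMPTY_COLUMNS)

def count_empty(row):
    # runs on the workers; each empty checked column bumps its accumulator
    for c in CHECK_EMPTY_COLUMNS:
        if not row.get(c):
            empty_records[c].add(1)
    return row

rows = sc.parallelize([{'IMSI': '123', 'CGI': '', 'DATETIME': ''}])
rows.map(count_empty).count()                          # an action must run before .value is read
for c in CHECK_EMPTY_COLUMNS:
    print '%s empty: %d' % (c, empty_records[c].value)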
Example #45
    if i % 2 == 0:
        det += sum
    else:
        det += (0 - sum)


if __name__ == "__main__":

    global rows
    global mat
    rows = 3

    sc = SparkContext("local", "Determinant")

    # accumulator variable to accumulate final determinant value
    det = sc.accumulator(0)

    # dense matrix stores values in column-major format, hence
    # the entered values are given in column-major order so that
    # we finally have a row-major matrix to operate on
    dm2 = Matrices.dense(rows, rows, [2, 7, 3, 3, 7, 8, 5, 8, 5])

    print "\n\nEntered matrix:\n", dm2.toArray()

    # here we divide the work between workers: each worker computes the
    # partial (cofactor) determinant for one item in the first row
    cols = sc.parallelize([i for i in range(0, rows)])

    mat = dm2.toArray()

    cols.foreach(dist_deter)
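# dist_deter() itself is not shown in this fragment; a plausible completion (an assumption,
# not the original code) expands the determinant along the first row: each worker takes one
# column index i, computes the minor determinant, and adds the signed cofactor term to the
# 'det' accumulator, matching the sign-by-parity logic at the top of this example.
import numpy as np

def dist_deter(i):
    global det, mat, rows
    minor = np.delete(np.delete(mat, 0, axis=0), i, axis=1)   # drop row 0 and column i
    term = mat[0][i] * np.linalg.det(minor)
    if i % 2 == 0:
        det += term
    else:
        det += (0 - term)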
Example #46
    if len(x[1].split(",")) > 1:
        return [(x[0], str(len(x[1].split(","))) + "/" + x[1])]
    else:
        return []


from pyspark import SparkContext

if __name__ == '__main__':
    # initialize
    sc = SparkContext("yarn", "labelp")
    # reduceByKey() uses ',' to collect all followers of a user
    p_list = sc.textFile("s3://spark-llh/inputfile/edges.csv")\
    .coalesce(100).map(divide).reduceByKey(lambda a,b:a+","+b).map(add_plus)
    # initialize accumulator
    p_count = sc.accumulator(0)
    while 1:
        p_list = p_list.coalesce(100).flatMap(p_check).union(
            p_list).reduceByKey(p_update)
        p_list.count()  # an action to trigger transformations and accumulator
        if p_count.value == 0:
            break
        p_count.value = 0
    n_list = sc.textFile("s3://spark-llh/inputfile/edges.csv")\
    .coalesce(100).map(reverse).reduceByKey(lambda a,b:a+","+b).map(add_minus)
    n_count = sc.accumulator(0)
    while 1:
        n_list = n_list.coalesce(100).flatMap(n_check).union(
            n_list).reduceByKey(n_update)
        n_list.count()
        if n_count.value == 0:
            break
        n_count.value = 0
Example #47
from pyspark import SparkContext

sc = SparkContext('spark://master:7077', 'accumulator example')
# accumulators are initialized with an initial value;
# they have an add() method to add values to the accumulator
# and a value property that is visible only to the driver

accum = sc.accumulator(0)
data = sc.parallelize(range(1,1000))

# we are going to iterate over our data and add each value to the 
# accumulator

data.foreach(lambda value: accum.add(value))

print accum.value
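# Note: accumulator updates made inside actions such as the foreach above are applied once per
# task, but updates made inside transformations (e.g. map or filter) can be re-applied when a
# task is retried or a stage is recomputed, so counts collected that way are approximate.
# Here the expected value is sum(range(1, 1000)) = 499500.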
Example #48
def map_phase(x):
    x = re.sub('--', ' ', x)
    x = re.sub("'", '', x)
    return re.sub('[?!@#$\'",.;:()]', '', x).lower()


def filter_tokyo(line):
    line = map_phase(line)
    global tokyo_count
    if line == 'tokyo':
        tokyo_count += 1
    return (line, 1)


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print >> sys.stderr, "Usage: wordcount <master> <inputfile> <outputfile>"
        exit(-1)
    sc = SparkContext(sys.argv[1],
                      "python_wordcount_sorted in bigdataprogrammiing")
    tokyo_count = sc.accumulator(0)
    lines = sc.textFile(sys.argv[2], 2)
    print(lines.getNumPartitions())  # print the number of partitions
    outRDD = lines.map(filter_tokyo)
    outRDD = outRDD.reduceByKey(add)
    outRDD = outRDD.filter(lambda x: x[0].find('neighborhood') == -1)
    outRDD = outRDD.filter(lambda x: x[0].find('tokyo') == -1)
    outRDD = outRDD.sortBy(lambda x: x[1])
    outRDD.saveAsTextFile(sys.argv[3])
    print("Number of Tokyo : {}".format(tokyo_count))
Example #49
	W_rdd = sc.parallelize(range(num_users+1))
	W_rdd = W_rdd.map(lambda x: (x, [random.uniform(0, 5) for _ in range(0, num_factors)])).keyBy(lambda entry: entry[0]/users_per_w_block.value)
	W_rdd = W_rdd.partitionBy(num_workers, lambda key: key).persist()

	#construct the H array
	H_rdd = sc.parallelize(range(num_movies+1))
	H_rdd = H_rdd.map(lambda x: (x, [random.uniform(0, 5) for _ in range(0, num_factors)])).keyBy(lambda entry: entry[0]/movies_per_h_block.value)
	#H_rdd = H_rdd.partitionBy(num_workers, partition_h).persist()

	#broadcast beta and lambda
	beta_br = sc.broadcast(beta_value)
	lambda_br = sc.broadcast(lambda_value)

	total_updates = sc.broadcast(0)
	curr_stratum = sc.broadcast(0)
	last_iter_total = sc.accumulator(0)

	#SGD begins
	for iter in range(num_iterations):
		#filter current stratum data
		stratum_V_rdd = V_rdd.filter(lambda entry: entry[1][1]==curr_stratum.value)

		#partition H
		H_rdd = H_rdd.map(lambda entry: (pattern_br.value[curr_stratum.value][(entry[1][0]/movies_per_h_block.value)],entry[1]))
		H_rdd = H_rdd.partitionBy(num_workers, lambda key: key).persist()

		#group V, W and H into a stratum
		stratum_rdd = stratum_V_rdd.groupWith(W_rdd,H_rdd).partitionBy(num_workers, lambda key: key).persist()

		#parallel SGD on strata
		stratum_rdd = stratum_rdd.map(sgd_func, True)
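	# Note: broadcast variables such as total_updates and curr_stratum above are read-only on
	# the workers and would have to be re-broadcast by the driver between iterations (that part
	# of the job is not shown here); only the accumulator last_iter_total can actually be
	# incremented from inside the worker-side sgd_func.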
Example #50
        qk = quadkey.from_geo((latitude, longitude), 15)
        acc_num_good_records.add(1)
        return "{},{},{},{},{},{}".format(record, country, city, latitude,
                                          longitude, qk.key)

    except:
        acc_num_bad_records.add(1)
        return "-----"


if __name__ == "__main__":
    sc = SparkContext()

    outputPath = "hdfs://localhost/user/cloudera/audi_case_study/location_info_added"

    reader = None

    acc_num_bad_records = sc.accumulator(0)

    acc_num_good_records = sc.accumulator(0)

    records = sc.textFile(
        "hdfs://localhost/user/cloudera/audi_case_study/data/")

    records.map(add_location_info) \
           .filter(lambda x: x != "-----") \
           .saveAsTextFile(outputPath)

    print("Number of good records: {}, Number of bad records: {}".format(
        acc_num_good_records.value, acc_num_bad_records.value))
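# Note: add_location_info increments the accumulators inside a map() transformation, so their
# values only become available on the driver after saveAsTextFile (an action) has run, which is
# why the print above works; retried tasks may double-count, so the totals are approximate.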
Example #51
         f = inputFiles[i]
         if subFileCount==filePartitionSize:
              subDirs.append(subDir)
              subDirNum += 1
              subFileCount = 0
              subDir = str(subDirNum) + "/"
              os.makedirs(input_dir + subDir)
         shutil.move(input_dir + f,input_dir + subDir)
         subFileCount += 1
    if subFileCount==filePartitionSize:
         subDirs.append(subDir)

    sc = SparkContext("local[" + numCores + "]" , "job", pyFiles=[realpath('helper.py')])
    eNodeBLoadVec = []
    for bs in eNodeBs:
         v = sc.accumulator([(0,0,0)]*len(intervals), VectorAccumulatorParamTriple())
         eNodeBLoadVec.append(v)

    prev_idx = 0
    for i in range(len(subDirs)):
         d = subDirs[i]
         end_idx = intervals.index(dirTimeBoundaries[i])
         intervalBoundary = (prev_idx+1,end_idx) #both indexes are included
         prev_idx = end_idx

         bs2data = sc.textFile(input_dir + d + '*.gz').filter(filterData).map(generateBS2Data).reduceByKey(reduceBS2IMSI2Data)
         bs2data.foreach(getBearerLoad)
     
    resetDirectories(subDirs,input_dir) 

    header = "time "
Example #52
import sys,getopt
from math import *
from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam
from pyspark.serializers import MarshalSerializer

#custom accumulator for boolean variable changevar
class VectorAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        return False
    def addInPlace(self, val1, val2):
        return val1 or val2


sc = SparkContext(appName="quasicliqueEnumeration",serializer=MarshalSerializer())


# changevar checks whether new clusters were formed and decides whether to go to the next iteration
changevar = sc.accumulator(False, VectorAccumulatorParam())

gamma = 0.9  # default value of gamma

k = 3  # default value of k - the clique size after which gamma should be applied


# turn each edge into an initial clique
def createinitialClusters(input):
    nodes = input.split()
    edge = ()
    if len(nodes) > 1:
        if int(nodes[1]) > int(nodes[0]):
            edge = (int(nodes[0]),int(nodes[1]))
        else:
            edge = (int(nodes[1]),int(nodes[0]))
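# A small, self-contained sketch (an assumption, not the original driver loop) of how a
# boolean OR accumulator like changevar is typically consumed: workers flip the flag, and the
# driver reads it after an action to decide whether to run another iteration.
from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam

class OrAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        return False
    def addInPlace(self, v1, v2):
        return v1 or v2

sc = SparkContext("local[2]", "or_accumulator_sketch")
changed = sc.accumulator(False, OrAccumulatorParam())
data = sc.parallelize(range(10))
data.foreach(lambda x: changed.add(x > 7))   # any partition can set the flag to True
print changed.value                          # read on the driver only after the action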
Example #53
#input data format:
# id0 id01 id02 ...
# id1 id11 id12 ...
#...
# first id (idn) is current node
# following ids (idnm) are nodes connected to idn.



from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("BFS")
sc = SparkContext(conf = conf)

hitCounter = sc.accumulator(0)

src_id = 1
dst_id = 6

def parseInput(line):
	l = line.split()
	v_id = int(l[0])
	
	if v_id == src_id:
		v_dist = 0
		v_status = 1
	else:
		v_dist = 9999
		v_status = 0
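# In the remainder of this example (not shown), hitCounter is typically incremented inside the
# BFS map step when the destination vertex dst_id is reached, and the driver checks
# hitCounter.value after each iteration's action so it can stop expanding the frontier early.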
Example #54
import sys
sys.path.insert(0, '.')
from pyspark import SparkContext, SparkConf
from commons.Utils import Utils

if __name__ == "__main__":
    conf = SparkConf().setAppName('StackOverFlowSurvey').setMaster("local[*]")
    sc = SparkContext(conf = conf)
    total = sc.accumulator(0)
    missingSalaryMidPoint = sc.accumulator(0)
    responseRDD = sc.textFile("in/2016-stack-overflow-survey-responses.csv")

    def filterResponseFromCanada(response):
        splits = Utils.COMMA_DELIMITER.split(response)
        # update accumulator for each record
        total.add(1)
        if not splits[14]:
            missingSalaryMidPoint.add(1)
        return splits[2] == "Canada"

    responseFromCanada = responseRDD.filter(filterResponseFromCanada)
    print("Count of responses from Canada: {}".format(responseFromCanada.count()))
    print("Total count of responses: {}".format(total.value))
    print("Count of responses missing salary middle point: {}" \
        .format(missingSalaryMidPoint.value))
Example #55
#!/usr/bin/python
# -*- coding: utf-8 -*-

from pyspark import SparkContext
from pyspark.accumulators import AccumulatorParam

sc = SparkContext("local", "Simple App")

class MultiplicadorAccum(AccumulatorParam):
	def zero(self, initialValue):
		return 1
	def addInPlace(self, v1, v2):
		return v1*v2


acc = sc.accumulator(1,MultiplicadorAccum())
sc.parallelize([1,2,3,4,5,5,6,7,7]).foreach(lambda x: acc.add(x))

print("value %d " % acc.value)
Example #56
    lambda tokens: (int(tokens[0]), int(tokens[1]), int(tokens[2])))


# ----------------------------------- Load Accumulator Object ------------------------------------------------
class VectorAccumulatorParam(AccumulatorParam):
    def zero(self, value):
        return [0.0] * len(value)

    def addInPlace(self, val1, val2):  # val1: running list on this node, val2: list being merged in
        val1 += val2
        return val1


# ----------------------------------- Process test RDD object ------------------------------------------------
def test_result(x):  # x[0]:RatingID, x[1]:user, x[2]:item
    global ans
    PredRating = getPredict(x[1], x[2])
    ans += [[x[0], PredRating]]


# ----------------------------------- Get predicted rating of Test.dat  ------------------------------------------------
ans = sc.accumulator([], VectorAccumulatorParam())
test_rdd.foreach(test_result)
ans = np.array(ans.value)
# print("RatingID ", ans[:, 0], "Rating", ans[:, 1])

# ----------------------------------- Output File  ------------------------------------------------
predictPdsDF = pd.DataFrame({'RatingID': ans[:, 0], 'Rating': ans[:, 1]})
predictPdsDF['RatingID'] = predictPdsDF['RatingID'].astype(int)
predictPdsDF.to_csv("predict.csv", index=False)
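# Note: collecting predictions through a list accumulator works for modest result sizes, since
# every update is shipped back to and merged on the driver; for large test sets the more usual
# pattern is to map to (RatingID, prediction) pairs and collect() or write the RDD directly.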
Example #57
SOURCE = options.source
TARGET = options.url
NOOP_WITHIN = options.noop
FIELD = options.field
if options.hostmap[0:24] == 'hdfs://analytics-hadoop/':
    hostMap = json.loads(subprocess.check_output(["hdfs", "dfs", "-cat", options.hostmap[23:]]))
else:
    hostMap = json.load(open(options.hostmap))

print "Transferring from %s to %s" % (SOURCE, TARGET)

if __name__ == "__main__":
    sc = SparkContext(appName="Send To ES: %s" % (TARGET))
    sqlContext = SQLContext(sc)
    broardcastMap = sc.broadcast(hostMap)
    documentCounter = sc.accumulator(0)
    updateCounter = sc.accumulator(0)
    errorCounter = sc.accumulator(0)
    failedDocumentCounter = sc.accumulator(0)

    def documentData(document):
        """
        Create textual representation of the document data for one document
        """
        updateData = {"update": {"_id": document.page_id}}
        if NOOP_WITHIN:
            updateDoc = {"script": {
                "script": "super_detect_noop",
                "lang": "native",
                "params": {
                    "handlers": {FIELD: "within " + NOOP_WITHIN + "%"},
Example #58
import json
import math
import os
import sys

from pyspark import SparkContext
from pyspark import SparkFiles

sparkMaster = sys.argv[1]
inputFile = sys.argv[2]
outputDir = sys.argv[3]

sc = SparkContext(sparkMaster, appName="ChapterSixExample")
file = sc.textFile(inputFile)

# Count lines with KK6JKQ using accumulators
count = sc.accumulator(0)


def incrementCounter(line):
    global count  # Access the counter
    if "KK6JKQ" in line:
        count += 1


file.foreach(incrementCounter)
print "Lines with KK6JKQ %d" % count.value

# Create Accumulator[Int] initialized to 0
blankLines = sc.accumulator(0)
dataLines = sc.accumulator(0)
Example #59
    #     # iscount:
    #     idscore = iscount.map(lambda a: (a[0][0], 0.15 + 0.85 * (a[0][1] + danglinglist.value() / n)))

    lines = sc.textFile("s3n://mapreduceadam/data/wikipedia_arcs")

    n = lines.flatMap(lambda a: a.encode("utf-8").strip().split("\t")).distinct().count()

    # Loads all ids from input file and initialize their neighbors.
    links = lines.map(lambda a: (a.split("\t")[0], a.split("\t")[1])).distinct().groupByKey().cache()

    # Loads all ids with other id(s) to from input file and initialize ranks of them to one.
    ranks = links.map(lambda a: (str(a[0]), 1.0))

    # Calculates and updates id ranks continuously using PageRank algorithm.
    for i in range(10):
        danglinglist = sc.accumulator(0.0)
        outjoin = links.rightOuterJoin(ranks)
        contribs = outjoin.flatMap(lambda a: computeContribs(a)).reduceByKey(lambda a, b: a + b)
        print contribs.collect()
        danglingtotal = danglinglist.value
        # Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.map(lambda a: calltribs(a))

    # file2 = sc.textFile("s3n://s15-p42-part2/data/wikipedia_mapping")
    file2 = sc.textFile("s3n://mapreduceadam/data/wikipedia_mapping")
    namelist = file2.map(lambda a: (a.split("\t")[0].encode("utf-8"), a.split("\t")[1].encode("utf-8")))
    # output = namelist.join(ranks).map(lambda a: a[1][0] + "\t" + str(a[1][1])).saveAsTextFile("p2output")
    print namelist.join(ranks).map(lambda a: a[1][0] + "\t" + str(a[1][1])).collect()
    sc.stop()
    # counts = file.flatMap(lambda line: line.split(" ")) \
    #     .map(lambda word: (word, 1)) \

Example #60
# Fetching a particular key from dict and changing its case
res = lookupAndSwapCase("key2")
print("============================================")
print("########## BROADCAST VARIABLE EXAMPLE #######")
print("Value at key2 is:", res)
print("Entire broadcast object is: ", data_broadcast)
print("============================================")
###
# Broadcast variable section ends
###

# Accumulator example begins
# Accumulator starts at 3, so the final value is 3 plus the sum of the RDD elements (3 + 1 + 2 + 3 = 9)
accu = sc.accumulator(3)


def accuFunction(arg):
    global accu
    accu += arg


rdd = sc.parallelize([1, 2, 3])  # Creates an RDD
rdd.foreach(accuFunction)  # Call the function for each rdd element

print("============================================")
print("########## ACCUMULATOR EXAMPLE #######")
print("Final accumulated value is: ", accu.value)
print("============================================")
# Accumulator example ends