Example #1
import os
import sys
import time

from pyspark import SparkConf, SparkContext

# ZTEUMTS_IN_PATH, readFileGzip, get_file_size and process_zteumts_record_map
# are expected to be defined elsewhere in the original module.


def main(argv=None):
    if argv is None:
        pass

    file_name = os.path.split(ZTEUMTS_IN_PATH)[1]
    zteumts_file = readFileGzip(ZTEUMTS_IN_PATH)
    # chunk bookkeeping variables and counters; none of them are updated in this snippet
    chunk_size = 2048
    bytes_to_read = chunk_size
    bytes_already_read = 0
    i = 0
    k = 0

    conf = SparkConf()
    sc = SparkContext(conf=conf)

    zteumts_rdd = sc.binaryRecords(ZTEUMTS_IN_PATH, 2048)

    print "rdd count: " + str(zteumts_rdd.count())

    sys.tracebacklimit = 0
    start_time_for_iteration = time.time()

    file_size = get_file_size(ZTEUMTS_IN_PATH)

    print "file size:" + str(file_size)

    #cleaned_rdd = zteumts_rdd.map(process_zteumts_record_map())
    parsed_records = zteumts_rdd.map(process_zteumts_record_map)
    print "parsed things: " + str(parsed_records.count())

    print "records in one partition:" + len(one_mapped_partition)

    print "Time to iterate entire contents:" + str(time.time() -
                                                   start_time_for_iteration)

    print "Total record count is:" + str(i)
    print "Total chunks divided :" + str(k)

    zteumts_file.close()
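The record parser process_zteumts_record_map is not shown above. As a rough, self-contained illustration of the same pattern, the sketch below decodes fixed-size records read with binaryRecords using struct; the 2048-byte record size matches the example, but the field layout (a 4-byte header of two unsigned shorts) and the input path are purely hypothetical and not taken from the original code.

import struct

from pyspark import SparkConf, SparkContext

def parse_record(chunk):
    # hypothetical layout: 2-byte record type + 2-byte payload length, then payload
    record_type, length = struct.unpack_from('>HH', chunk, 0)
    return (record_type, chunk[4:4 + length])

if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName('binaryRecordsSketch'))
    records = sc.binaryRecords('/path/to/dump.bin', 2048)  # placeholder path
    parsed = records.map(parse_record)
    print(parsed.count())
    sc.stop()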
Example #2
    result_file.write(
        'Cache parameters (assoc, intra_group_threshold, line_bits, set_mask, set_bits, inter_group_threshold): '
        + str(CACHE_PARAMS) + '\n\n')

    if scope == 'L1I':
        # 1st Q: program counter
        unpack_format = '1Q'  # one 64-bit value
        NUM_OF_ITEMS = 1  # number of items per record (line) of the binary file
    else:
        # 1st Q: R/W flag | 2nd Q: memory address
        unpack_format = '2Q'  # two 64-bit values
        NUM_OF_ITEMS = 2  # number of items per record (line) of the binary file

    NUM_BYTES_PER_ITEM = 8  # number of bytes per item
    recordLength = NUM_OF_ITEMS * NUM_BYTES_PER_ITEM  # total number of bytes in each record (line)
    binary_rdd = sc.binaryRecords(args.input, recordLength)

    set_grouped_rdd = binary_rdd.map(
        lambda record: unpack_from(unpack_format, record)).zipWithIndex()
    set_grouped_rdd.cache()

    # ------------------------- L1D or LLC read analysis -------------------------
    if scope in ['L1D', 'LLC']:
        if mode in ['READ', 'BOTH']:
            # memory access addresses are in the second column
            set_grouped_rdd_read = set_grouped_rdd \
                .filter(lambda (items, index): items[0] == RD) \
                .map(lambda (items, index): (paddrToSetIdx({'addrs': items[1], 'params': CACHE_PARAMS}),
                                             (index, paddrToTag({'addrs': items[1], 'params': CACHE_PARAMS})))) \
                .groupByKey() \
                .map(lambda x: (x[0], list(x[1])))
            # Create [(number of unique address lines in group0, average access time in group0), ...],
            # then filter by temporal locality of the line access times and the number of unique addresses.
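The helpers paddrToSetIdx and paddrToTag are defined elsewhere in the original script. Assuming CACHE_PARAMS follows the order written to result_file above (assoc, intra_group_threshold, line_bits, set_mask, set_bits, inter_group_threshold), a plausible, purely illustrative version of the usual offset/index/tag split would look like this:

def paddrToSetIdx(arg):
    # hypothetical: drop the line-offset bits, then mask out the set index
    _assoc, _intra, line_bits, set_mask, _set_bits, _inter = arg['params']
    return (arg['addrs'] >> line_bits) & set_mask

def paddrToTag(arg):
    # hypothetical: everything above the offset and index bits is the tag
    _assoc, _intra, line_bits, _set_mask, set_bits, _inter = arg['params']
    return arg['addrs'] >> (line_bits + set_bits)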
Example #3
    outfilename = basedir + "/reducemap_binary_output-" + str(idx).zfill(2) + ".bin"
    outfile = open(outfilename, 'wb')
    for x in iterator:
        outfile.write(str(x[1].data))
    outfile.close()

if __name__ == "__main__":

    if len(sys.argv) != 3:
        print("Usage: simple_reducemap <fileA> <fileB>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="SimpleReduceMap")
    # todo: https://spark.apache.org/docs/latest/configuration.html#spark-properties
    # conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    linesA = sc.binaryRecords(sys.argv[1], 24)
    A = linesA.mapPartitionsWithIndex(
        parseVectorFunctor(linesA.getNumPartitions()), True).cache()
    print(A.getStorageLevel())

    linesB = sc.binaryRecords(sys.argv[2], 24)
    B = linesB.mapPartitionsWithIndex(
        parseVectorFunctor(linesB.getNumPartitions()), True).cache()

    C = A.union(B).cache()

    D = C.reduceByKey(dot_vec3).cache()
    print("numPartitions(%d,%s): %d"%(D.id(),D.name(),D.getNumPartitions()))
    #D.foreach(lambda v: print(str(v)))
    print(D.getStorageLevel())
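parseVectorFunctor and dot_vec3 are defined elsewhere. Given the record length of 24 bytes and the "three doubles per vector" note in Example #6, each record presumably holds three float64 values; a minimal stand-alone decoder, independent of the original helpers, could look like this (little-endian byte order assumed):

import struct

def decode_vec3(record):
    # hypothetical decoder: 24 bytes -> three little-endian float64 values
    return struct.unpack('<3d', record)

# e.g. sc.binaryRecords(sys.argv[1], 24).map(decode_vec3).take(3)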
Example #4
from pyspark import SparkConf, SparkContext
conf = (SparkConf()
         .setMaster("local[4]")
         .setAppName("MyApp")
         .set("spark.executor.memory", "1g")
         .set('spark.local.dir', './target/tmp'))
sc = SparkContext(conf=conf)

def test(a):
    # runs on the executors: print the raw record and pass it through unchanged
    print 'a', a
    return a

words = sc.binaryRecords("./gen/data/nums", 3)  # fixed-size records, 3 bytes each
words = words.map(test)
words.saveAsTextFile('./target/result3')

sc.stop()
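If only the printing side effect is wanted and no output file is needed, foreach runs the function on the executors without building a new RDD. A minimal sketch reusing the same (assumed) local input path:

from __future__ import print_function

from pyspark import SparkConf, SparkContext

def show(rec):
    print('record:', rec)  # printed in the executor logs, not on the driver

sc = SparkContext(conf=SparkConf().setMaster("local[4]").setAppName("ForeachSketch"))
sc.binaryRecords("./gen/data/nums", 3).foreach(show)
sc.stop()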
Example #5
import time

from pyspark import SparkConf, SparkContext

# app_name, input_bucket and input_path are defined elsewhere in the original script.
conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf=conf)

# uncomment this block if run local
# aws_profile = 'default'
# config = configparser.ConfigParser()
# config.read(os.path.expanduser("~/.aws/credentials"))
# access_id = config.get(aws_profile, "aws_access_key_id")
# access_key = config.get(aws_profile, "aws_secret_access_key")
# hadoop_conf = sc._jsc.hadoopConfiguration()
# hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
# hadoop_conf.set("fs.s3n.awsAccessKeyId", access_id)
# hadoop_conf.set("fs.s3n.awsSecretAccessKey", access_key)

input_file = sc.binaryRecords(input_bucket + input_path, 100)  # fixed 100-byte records
input_file = input_file.map(lambda line: (line[:10], line[10:100]))  # key = first 10 bytes, value = remaining 90
input_file = input_file.partitionBy(10)

# logging.basicConfig(level=print)
print("input file patition num: {}".format(len(input_file.glom().collect())))
print("len first: {}".format(len(input_file.first())))
print("type first: {}".format(type(input_file.first())))
print("len first[0]: {}".format(len(input_file.first()[0])))
print("type first[0]: {}".format(type(input_file.first()[0])))
print("input count: " + str(input_file.count()))

start = time.time()
# input_file.map(lambda line: (line[:10], line[10:100]))\
sorted_op = input_file.sortByKey()\
    .map(lambda item: item[0]+item[1])
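All of the transformations above are lazy, so nothing is actually sorted until an action runs, and the snippet stops right after start = time.time(). One way the job might be triggered and timed; the output location is a placeholder, not taken from the original:

sorted_op.saveAsTextFile(input_bucket + "/sorted-output")  # hypothetical output path
print("sort + write took {:.1f}s".format(time.time() - start))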
Example #6
def construct_apply_shift(shift):
    # returns a function that shifts a 3-vector p by the fixed offset 'shift'
    return lambda p: add_vec3(p, shift)

def savebin(iterator):
    basedir = '/tmp'  # '/mnt'
    idx = len(glob(basedir + '/binary_output*.bin'))
    outfilename = basedir + "/binary_output-" + str(idx).zfill(2) + ".bin"
    outfile = open(outfilename, 'wb')
    for x in iterator:
        outfile.write(x.data)
    outfile.close()

if __name__ == "__main__":

    if len(sys.argv) != 2:
        print("Usage: simple_map <file>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="SimpleMap")

    lines = sc.binaryRecords(sys.argv[1], 24)  # three doubles per vector (record)
    # Is .cache() causing unnecessary/redundant processing when memory is
    # overflowed? At least for this case, cache isn't needed anyway. The concern:
    # calling cache with insufficient memory computes the cache entry, which
    # pushes previous entries out, so when the data is actually used it has to be
    # computed again. As my pappy always told me, "be careful with cache!"
    A = lines.map(parseVector)

    print("numPartitions(%d,%s): %d"%(A.id(),A.name(),A.getNumPartitions()))

    shift = np.array([25.25, -12.125, 6.333], dtype=np.float64)
    B = A.map(construct_apply_shift(shift))
    print("numPartitions(%d,%s): %d"%(B.id(),B.name(),B.getNumPartitions()))

    B.foreachPartition(savebin)

    sc.stop()
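If caching were actually wanted despite the concern raised in the comment above, one option is to persist with a storage level that spills evicted partitions to local disk instead of recomputing them. A minimal sketch, not part of the original script:

from pyspark import StorageLevel

A = lines.map(parseVector).persist(StorageLevel.MEMORY_AND_DISK)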
Example #7
from __future__ import print_function

import struct
import sys
from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: bfs <inputfile> <root>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="bfs.py")
    bFile = sc.binaryRecords(sys.argv[1], 16)
    root = int(sys.argv[2])

    # Read all edge inputs, reverse edges and union as graph is undirected
    edgeInputs = bFile.map(lambda x: struct.unpack("<qq", x))
    allEdgeList = sc.union(
        [edgeInputs, edgeInputs.map(lambda x: (x[1], x[0]))])
    totalEdgeCount = allEdgeList.count()

    # Reduce edge list to tuple of vertex and array of child vertices
    inputGraph = allEdgeList.map(
        lambda edge: (edge[0], [edge[1]])).reduceByKey(lambda a, b: a + b)
    inputGraph.cache()
    totalVertexCount = inputGraph.count()

    # a distance of -1 marks a vertex that has not been reached yet
    distances = inputGraph.map(lambda x: (x[0], -1))
    distances.cache()
    currentLevel = 0
    currentLevelQueue = sc.parallelize([(root, currentLevel)])
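The snippet ends right after the root vertex is placed in the level-0 frontier. A generic level-synchronous BFS step, sketched below as the usual pattern rather than as the original script's continuation, expands the frontier through the adjacency lists and keeps only vertices that have not been visited yet:

    visited = currentLevelQueue  # (vertex, level) pairs reached so far
    while not currentLevelQueue.isEmpty():
        neighbours = currentLevelQueue.join(inputGraph) \
            .flatMap(lambda kv: [(child, kv[1][0] + 1) for child in kv[1][1]])
        currentLevelQueue = neighbours.subtractByKey(visited).reduceByKey(min)
        visited = visited.union(currentLevelQueue)
    distances = distances.leftOuterJoin(visited) \
        .map(lambda kv: (kv[0], kv[1][1] if kv[1][1] is not None else -1))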