Example #1
import os
import sys
import time

from pyspark import SparkConf, SparkContext

# ZTEUMTS_IN_PATH, readFileGzip, get_file_size and process_zteumts_record_map
# are expected to be defined elsewhere in the original module.


def main(argv=None):
    if argv is None:
        pass

    file_name = os.path.split(ZTEUMTS_IN_PATH)[1]
    zteumts_file = readFileGzip(ZTEUMTS_IN_PATH)
    # chunk bookkeeping variables and counters; none of them are updated in this snippet
    chunk_size = 2048
    bytes_to_read = chunk_size
    bytes_already_read = 0
    i = 0
    k = 0

    conf = SparkConf()
    sc = SparkContext(conf=conf)

    zteumts_rdd = sc.binaryRecords(ZTEUMTS_IN_PATH, 2048)

    print "rdd count: " + str(zteumts_rdd.count())

    sys.tracebacklimit = 0
    start_time_for_iteration = time.time()

    file_size = get_file_size(ZTEUMTS_IN_PATH)

    print "file size:" + str(file_size)

    #cleaned_rdd = zteumts_rdd.map(process_zteumts_record_map())
    parsed_records = zteumts_rdd.map(process_zteumts_record_map)
    print "parsed things: " + str(parsed_records.count())

    print "records in one partition:" + len(one_mapped_partition)

    print "Time to iterate entire contents:" + str(time.time() -
                                                   start_time_for_iteration)

    print "Total record count is:" + str(i)
    print "Total chunks divided :" + str(k)

    zteumts_file.close()
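The record parser process_zteumts_record_map is not shown above. As a rough, self-contained illustration of the same pattern, the sketch below decodes fixed-size records read with binaryRecords using struct; the 2048-byte record size matches the example, but the field layout (a 4-byte header of two unsigned shorts) and the input path are purely hypothetical and not taken from the original code.

import struct

from pyspark import SparkConf, SparkContext

def parse_record(chunk):
    # hypothetical layout: 2-byte record type + 2-byte payload length, then payload
    record_type, length = struct.unpack_from('>HH', chunk, 0)
    return (record_type, chunk[4:4 + length])

if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName('binaryRecordsSketch'))
    records = sc.binaryRecords('/path/to/dump.bin', 2048)  # placeholder path
    parsed = records.map(parse_record)
    print(parsed.count())
    sc.stop()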
Example #2
    result_file.write(
        'Cache parameters (assoc, intra_group_threshold, line_bits, set_mask, set_bits, inter_group_threshold): '
        + str(CACHE_PARAMS) + '\n\n')

    if scope == 'L1I':
        # 1st Q: program counter
        unpack_format = '1Q'  # one 64-bit value
        NUM_OF_ITEMS = 1  # number of items per record (line) of the binary file
    else:
        # 1st Q: R/W flag | 2nd Q: memory address
        unpack_format = '2Q'  # two 64-bit values
        NUM_OF_ITEMS = 2  # number of items per record (line) of the binary file

    NUM_BYTES_PER_ITEM = 8  # number of bytes per item
    recordLength = NUM_OF_ITEMS * NUM_BYTES_PER_ITEM  # total number of bytes in each record (line)
    binary_rdd = sc.binaryRecords(args.input, recordLength)

    set_grouped_rdd = binary_rdd.map(
        lambda record: unpack_from(unpack_format, record)).zipWithIndex()
    set_grouped_rdd.cache()

    # ------------------------- L1D or LLC read analysis -------------------------
    if scope in ['L1D', 'LLC']:
        if mode in ['READ', 'BOTH']:
            # memory access addresses are in the second column
            set_grouped_rdd_read = set_grouped_rdd \
                .filter(lambda (items, index): items[0] == RD) \
                .map(lambda (items, index): (paddrToSetIdx({'addrs': items[1], 'params': CACHE_PARAMS}),
                                             (index, paddrToTag({'addrs': items[1], 'params': CACHE_PARAMS})))) \
                .groupByKey() \
                .map(lambda x: (x[0], list(x[1])))
            # Create [(number of unique address lines in group0, average access time in group0), ...],
            # then filter by temporal locality of the line access times and the number of unique addresses.
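The helpers paddrToSetIdx and paddrToTag are defined elsewhere in the original script. Assuming CACHE_PARAMS follows the order written to result_file above (assoc, intra_group_threshold, line_bits, set_mask, set_bits, inter_group_threshold), a plausible, purely illustrative version of the usual offset/index/tag split would look like this:

def paddrToSetIdx(arg):
    # hypothetical: drop the line-offset bits, then mask out the set index
    _assoc, _intra, line_bits, set_mask, _set_bits, _inter = arg['params']
    return (arg['addrs'] >> line_bits) & set_mask

def paddrToTag(arg):
    # hypothetical: everything above the offset and index bits is the tag
    _assoc, _intra, line_bits, _set_mask, set_bits, _inter = arg['params']
    return arg['addrs'] >> (line_bits + set_bits)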
Example #3
    outfilename = basedir + "/reducemap_binary_output-" + str(idx).zfill(2) + ".bin"
    outfile = open(outfilename, 'wb')
    for x in iterator:
        outfile.write(str(x[1].data))
    outfile.close()

if __name__ == "__main__":

    if len(sys.argv) != 3:
        print("Usage: simple_reducemap <fileA> <fileB>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="SimpleReduceMap")
    # todo: https://spark.apache.org/docs/latest/configuration.html#spark-properties
    # conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    linesA = sc.binaryRecords(sys.argv[1], 24)
    A = linesA.mapPartitionsWithIndex(
        parseVectorFunctor(linesA.getNumPartitions()), True).cache()
    print(A.getStorageLevel())

    linesB = sc.binaryRecords(sys.argv[2], 24)
    B = linesB.mapPartitionsWithIndex(
        parseVectorFunctor(linesB.getNumPartitions()), True).cache()

    C = A.union(B).cache()

    D = C.reduceByKey(dot_vec3).cache()
    print("numPartitions(%d,%s): %d"%(D.id(),D.name(),D.getNumPartitions()))
    #D.foreach(lambda v: print(str(v)))
    print(D.getStorageLevel())
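parseVectorFunctor and dot_vec3 are defined elsewhere. Given the record length of 24 bytes and the "three doubles per vector" note in Example #6, each record presumably holds three float64 values; a minimal stand-alone decoder, independent of the original helpers, could look like this (little-endian byte order assumed):

import struct

def decode_vec3(record):
    # hypothetical decoder: 24 bytes -> three little-endian float64 values
    return struct.unpack('<3d', record)

# e.g. sc.binaryRecords(sys.argv[1], 24).map(decode_vec3).take(3)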
Example #4
from pyspark import SparkConf, SparkContext
conf = (SparkConf()
         .setMaster("local[4]")
         .setAppName("MyApp")
         .set("spark.executor.memory", "1g")
         .set('spark.local.dir', './target/tmp'))
sc = SparkContext(conf=conf)

def test(a):
    # runs on the executors: print the raw record and pass it through unchanged
    print 'a', a
    return a

words = sc.binaryRecords("./gen/data/nums", 3)  # fixed-size records, 3 bytes each
words = words.map(test)
words.saveAsTextFile('./target/result3')

sc.stop()
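If only the printing side effect is wanted and no output file is needed, foreach runs the function on the executors without building a new RDD. A minimal sketch reusing the same (assumed) local input path:

from __future__ import print_function

from pyspark import SparkConf, SparkContext

def show(rec):
    print('record:', rec)  # printed in the executor logs, not on the driver

sc = SparkContext(conf=SparkConf().setMaster("local[4]").setAppName("ForeachSketch"))
sc.binaryRecords("./gen/data/nums", 3).foreach(show)
sc.stop()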
Example #5
import time

from pyspark import SparkConf, SparkContext

# app_name, input_bucket and input_path are defined elsewhere in the original script.
conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf=conf)

# uncomment this block if run local
# aws_profile = 'default'
# config = configparser.ConfigParser()
# config.read(os.path.expanduser("~/.aws/credentials"))
# access_id = config.get(aws_profile, "aws_access_key_id")
# access_key = config.get(aws_profile, "aws_secret_access_key")
# hadoop_conf = sc._jsc.hadoopConfiguration()
# hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
# hadoop_conf.set("fs.s3n.awsAccessKeyId", access_id)
# hadoop_conf.set("fs.s3n.awsSecretAccessKey", access_key)

input_file = sc.binaryRecords(input_bucket + input_path, 100)  # fixed 100-byte records
input_file = input_file.map(lambda line: (line[:10], line[10:100]))  # key = first 10 bytes, value = remaining 90
input_file = input_file.partitionBy(10)

# logging.basicConfig(level=print)
print("input file patition num: {}".format(len(input_file.glom().collect())))
print("len first: {}".format(len(input_file.first())))
print("type first: {}".format(type(input_file.first())))
print("len first[0]: {}".format(len(input_file.first()[0])))
print("type first[0]: {}".format(type(input_file.first()[0])))
print("input count: " + str(input_file.count()))

start = time.time()
# input_file.map(lambda line: (line[:10], line[10:100]))\
sorted_op = input_file.sortByKey()\
    .map(lambda item: item[0]+item[1])
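All of the transformations above are lazy, so nothing is actually sorted until an action runs, and the snippet stops right after start = time.time(). One way the job might be triggered and timed; the output location is a placeholder, not taken from the original:

sorted_op.saveAsTextFile(input_bucket + "/sorted-output")  # hypothetical output path
print("sort + write took {:.1f}s".format(time.time() - start))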
Example #6
def construct_apply_shift(shift):
    # returns a function that shifts a 3-vector p by the fixed offset 'shift'
    return lambda p: add_vec3(p, shift)

def savebin(iterator):
    basedir = '/tmp'  # '/mnt'
    idx = len(glob(basedir + '/binary_output*.bin'))
    outfilename = basedir + "/binary_output-" + str(idx).zfill(2) + ".bin"
    outfile = open(outfilename, 'wb')
    for x in iterator:
        outfile.write(x.data)
    outfile.close()

if __name__ == "__main__":

    if len(sys.argv) != 2:
        print("Usage: simple_map <file>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="SimpleMap")

    lines = sc.binaryRecords(sys.argv[1], 24)  # three doubles per vector (record)
    # Is .cache() causing unnecessary/redundant processing when memory is
    # overflowed? At least for this case, cache isn't needed anyway. The concern:
    # calling cache with insufficient memory computes the cache entry, which
    # pushes previous entries out, so when the data is actually used it has to be
    # computed again. As my pappy always told me, "be careful with cache!"
    A = lines.map(parseVector)

    print("numPartitions(%d,%s): %d"%(A.id(),A.name(),A.getNumPartitions()))

    shift = np.array([25.25, -12.125, 6.333], dtype=np.float64)
    B = A.map(construct_apply_shift(shift))
    print("numPartitions(%d,%s): %d"%(B.id(),B.name(),B.getNumPartitions()))

    B.foreachPartition(savebin)

    sc.stop()
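If caching were actually wanted despite the concern raised in the comment above, one option is to persist with a storage level that spills evicted partitions to local disk instead of recomputing them. A minimal sketch, not part of the original script:

from pyspark import StorageLevel

A = lines.map(parseVector).persist(StorageLevel.MEMORY_AND_DISK)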
Example #7
from __future__ import print_function

import struct
import sys
from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: bfs <inputfile> <root>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="bfs.py")
    bFile = sc.binaryRecords(sys.argv[1], 16)
    root = int(sys.argv[2])

    # Read all edge inputs, reverse edges and union as graph is undirected
    edgeInputs = bFile.map(lambda x: struct.unpack("<qq", x))
    allEdgeList = sc.union(
        [edgeInputs, edgeInputs.map(lambda x: (x[1], x[0]))])
    totalEdgeCount = allEdgeList.count()

    # Reduce edge list to tuple of vertex and array of child vertices
    inputGraph = allEdgeList.map(
        lambda edge: (edge[0], [edge[1]])).reduceByKey(lambda a, b: a + b)
    inputGraph.cache()
    totalVertexCount = inputGraph.count()

    # a distance of -1 marks a vertex that has not been reached yet
    distances = inputGraph.map(lambda x: (x[0], -1))
    distances.cache()
    currentLevel = 0
    currentLevelQueue = sc.parallelize([(root, currentLevel)])
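The snippet ends right after the root vertex is placed in the level-0 frontier. A generic level-synchronous BFS step, sketched below as the usual pattern rather than as the original script's continuation, expands the frontier through the adjacency lists and keeps only vertices that have not been visited yet:

    visited = currentLevelQueue  # (vertex, level) pairs reached so far
    while not currentLevelQueue.isEmpty():
        neighbours = currentLevelQueue.join(inputGraph) \
            .flatMap(lambda kv: [(child, kv[1][0] + 1) for child in kv[1][1]])
        currentLevelQueue = neighbours.subtractByKey(visited).reduceByKey(min)
        visited = visited.union(currentLevelQueue)
    distances = distances.leftOuterJoin(visited) \
        .map(lambda kv: (kv[0], kv[1][1] if kv[1][1] is not None else -1))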