Example no. 1
# assumes the project-local connector used in the other examples
from ibm_cf_connector import CloudFunctions

def invokeFunctions(function, nFunctions, fileSize, fileName, res):
    bottomRange = 0
    cf = CloudFunctions(res['ibm_cf'])

    #range(0, nFunctions - 1) since the last function must be invoked differently
    for i in range(0, nFunctions - 1):
        topRange = int(fileSize / nFunctions) + bottomRange
        #invoke the function with all its parameters
        #param -> res, fileName, bottom and top range, functionNumber
        cf.invoke(
            function, {
                "res": res,
                "fileName": fileName,
                "topRange": str(topRange),
                "bottomRange": str(bottomRange),
                "functionNumber": str(i)
            })
        bottomRange = topRange

    #same as the previous ones, but topRange is replaced by fileSize
    cf.invoke(
        function, {
            "res": res,
            "fileName": fileName,
            "topRange": str(fileSize),
            "bottomRange": str(bottomRange),
            "functionNumber": str(-1)
        })

    #invoke the reduce function (it receives messages from the other ones)
    #invoke with result since we want the time needed to finish
    #  param -> res, iter, fileName
    _ = cf.invoke_with_result("reduce", {
        "res": res,
        "iter": nFunctions,
        "fileName": fileName
    })
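Each invocation above ships only byte offsets, so the map action itself must fetch and process its slice of the file. The following is a minimal sketch of such an action, not the project's actual code; COS_Backend and the 'sdprac1' bucket are assumptions borrowed from Example no. 3:

from cos_backend import COS_Backend

def main(params):
    # Hypothetical map action: read the assigned byte range and count its words.
    # The parameter names mirror the dict built by invokeFunctions above.
    cos = COS_Backend(params['res']['ibm_cos'])
    bottom = int(params['bottomRange'])
    top = int(params['topRange'])
    # assumption: get_object returns the object body as bytes
    chunk = cos.get_object('sdprac1', params['fileName'])[bottom:top]
    words = chunk.decode('utf-8', errors='ignore').split()
    return {'functionNumber': params['functionNumber'], 'count': len(words)}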
Example no. 2
# imports assumed from the surrounding project; readParamsCloudF, readParamsCOS
# and CloudFunctions are project-local helpers seen across these examples
import json
import time
from datetime import datetime
from ibm_cf_connector import CloudFunctions

def main(filename, nchunks, op):
    #Time measurement begin (for testing)
    i_time=datetime.now()
    
    #Checking parameters
    if op not in ['count', 'diffcount']:
        print("Valid operators are count (total number of words) or diffcount (counts amounts of each individual word)")
        exit(1)
    if nchunks < 1:
        print("Number of parallel processes must be a positive integer!")
        exit(1)
        
    #Reading config files and preparing the function backend
    configIBMCloud=readParamsCloudF()
    configCOS= readParamsCOS()
    functionbackend = CloudFunctions(configIBMCloud)

    #Preparing dictionaries to serve as function arguments
    argsOriginal = {}
    argsOriginal['filename']=filename
    argsOriginal['configCOS']= configCOS
    argsOriginal['bucket'] = 'originals'
    
    argsTemp = {}
    argsTemp['configCOS']= configCOS
    argsTemp['targetBucket'] = 'temps1'
    argsTemp['sourceBucket'] = 'originals' 
    argsTemp['sourceFile'] = filename
    argsTemp['op']=op

    #Reading the header of the origin file in COS (located in bucket 'originals') and calculating chunk size
    headers=functionbackend.invoke_with_result('GetHead', argsOriginal)
    if 'error' in headers:
        print('File does not exist in the target COS bucket. Exiting program')
        exit(1)
    sizeFile = int(headers.get('content-length'))

    #Begin chunking
    print('All parameters obtained. Beginning chunking')
    currentByte=0
    chunkCounter =0 
    chunkSize=(sizeFile//nchunks)+1
       
    while currentByte<sizeFile:
        argsTemp['startbyte']=currentByte
        
        if (currentByte+chunkSize)>sizeFile:
            argsTemp['endbyte']=sizeFile
        else:
            argsTemp['endbyte']=currentByte+chunkSize
        
        argsTemp['targetFile']='temp'+str(chunkCounter)
        #mapper.main(argsTemp)  #uncomment this for linear (sequential) mapping (requires mapper.py)
        functionbackend.invoke('Map', argsTemp)
        currentByte=currentByte+chunkSize+1
        chunkCounter+=1
    
    #Wait until all mappers have finished (all temp files created)
    print('All chunks sent. Waiting for results...')
    ##listObjects=cosbackend.list_objects('temp1')
    listObjects= (functionbackend.invoke_with_result('ListObjects', { 'bucket': 'temps1' , 'configCOS': configCOS})).get('files')
    timeout=0
    while(len(listObjects)<nchunks):
        time.sleep(0.1)
        listObjects= (functionbackend.invoke_with_result('ListObjects', { 'bucket': 'temps1' , 'configCOS': configCOS})).get('files')
        timeout+=1
        if (timeout>600):
            #Note: We discovered that files containing certain multi-byte UTF characters
            #can have a character accidentally split between two mappers at a chunk
            #boundary, leaving both mappers unable to return valid results. The program
            #therefore assumes that if a full minute passes without progress, the
            #mappers are stuck and the execution has failed.
            print('Program timed out - Mapper found an issue. Please check the encoding in your file!')
            print('Please wait while we clean up temp files...')
            for elem in listObjects: functionbackend.invoke('Delete', {'bucket':'temps1', 'configCOS':configCOS, 'filename':elem})
            print('All files cleaned up. Exiting program.')
            exit(1)
        
    #Begin reduction
    print('Mapping complete. Proceeding to reduction')
    #result=reduce.main({'bucket':'temps1', 'configCOS':configCOS, 'op':op, 'prefix':'temp'})
    functionbackend.invoke_with_result('Reducer', {'bucket':'temps1', 'configCOS':configCOS, 'op':op, 'prefix':'temp'})
    result=(functionbackend.invoke_with_result('GetObject', {'bucket':'temps1', 'configCOS':configCOS, 'filename':'resultados'})).get('content')
    result=json.loads(result)

    #time measurement end and printing (for reference)
    f_time=datetime.now()
    print('Total elapsed time='+str(f_time-i_time))
    
    #cleanup
    functionbackend.invoke('Delete', {'bucket':'temps1', 'configCOS':configCOS, 'filename':'resultados'})
    
    return result
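The timeout comment above points at the root cause: splitting on raw byte offsets can cut a multi-byte UTF-8 character in half at a chunk boundary. A common remedy, sketched here under the assumption that the splitter can inspect the raw bytes, is to snap each boundary forward past any UTF-8 continuation bytes:

def snap_to_char_boundary(data, pos):
    # UTF-8 continuation bytes have the bit pattern 10xxxxxx; advance the
    # boundary until it lands on a byte that starts a new character.
    while pos < len(data) and (data[pos] & 0xC0) == 0x80:
        pos += 1
    return pos

Applying this to startbyte and endbyte before each Map invocation would make the timeout path unnecessary for well-formed UTF-8 input.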
Example no. 3
import sys

from cos_backend import COS_Backend
from ibm_cf_connector import CloudFunctions
import yaml
import time

if __name__ == '__main__':
    with open('ibm_cloud_config.txt', 'r') as config_file:
        res = yaml.safe_load(config_file)

    ibm_cos = res['ibm_cos']
    ibm_cf = res['ibm_cf']
    cos = COS_Backend(ibm_cos)
    cloud = CloudFunctions(ibm_cf)

    fileread = sys.argv[1]
    partitions = int(sys.argv[2])

    #Add required files to the cloud
    with open(fileread, 'r', encoding='utf-8', errors='ignore') as f:
        cos.put_object('sdprac1', fileread, f.read())

    with open('wordcount.zip', 'rb') as f:
        cloud.create_action('wordcount', f.read())

    with open('reduce.zip', 'rb') as f:
        cloud.create_action('reduce', f.read())

    with open('countwords.zip', 'rb') as f:
        cloud.create_action('countwords', f.read())
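Deployment alone runs nothing; a natural follow-up, appended inside the same __main__ block, is to fan out the mappers and wait on the reducer. This sketch borrows the parameter protocol from Example no. 1, so treat the names and invocation order as assumptions rather than the project's actual driver:

    # Hypothetical driver: one 'wordcount' invocation per partition, then
    # block on 'reduce' for the aggregated result.
    file_size = len(open(fileread, 'r', encoding='utf-8', errors='ignore').read())
    bottom = 0
    for i in range(partitions):
        # the last partition absorbs the remainder of the file
        top = file_size if i == partitions - 1 else bottom + file_size // partitions
        cloud.invoke('wordcount', {
            'res': res,
            'fileName': fileread,
            'bottomRange': str(bottom),
            'topRange': str(top),
            'functionNumber': str(i)
        })
        bottom = top
    result = cloud.invoke_with_result('reduce', {
        'res': res, 'iter': partitions, 'fileName': fileread
    })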
Example no. 4
# imports assumed for this example: pika drives the queue system; COSBackend,
# CloudFunctions and SingleCallback are project-local helpers
import time
import pika
import yaml

class Orchestrator:
    def __init__(self, target_bucket, target_fname, upload=False):
        self.target_fname = target_fname
        self.target_bucket = target_bucket
        self.ini_error = False
        format_str = "cloudfunctions:\n  'endpoint': ''\n  'namespace': ''\n  'api_key': ''\nrabbitamqp:\n  'url': ''\ncos:\n  service_endpoint: ''\n  secret_key: ''\n  access_key: ''"

        try:
            # load keys securely
            with open('secret.yaml', 'r') as f:
                secret = yaml.safe_load(f)

            # initialize the remote storage wrapper, and upload the target file
            self.cb = COSBackend(secret['cos']['service_endpoint'],
                                 secret['cos']['secret_key'],
                                 secret['cos']['access_key'])
            if upload:
                target_file = open(self.target_fname, "rb")
                self.cb.put_object(target_bucket, target_fname,
                                   target_file.read())
                target_file.close()

            # retrieve file length, ensure file has been uploaded
            try:
                self.fsize = int(
                    self.cb.head_object(self.target_bucket,
                                        self.target_fname)['content-length'])
            except Exception:
                print(
                    'File \'{}\' was not found in bucket \'{}\'. Upload it and retry.'
                    .format(self.target_fname, self.target_bucket))
                self.ini_error = True
                return

            # initialize the function wrapper
            config = {}
            config['endpoint'] = secret['cloudfunctions']['endpoint']
            config['namespace'] = secret['cloudfunctions']['namespace']
            config['api_key'] = secret['cloudfunctions']['api_key']
            self.cf = CloudFunctions(config)

            # initialize the queue system
            self.pika_params = pika.URLParameters(secret['rabbitamqp']['url'])

        except KeyError:
            print('Wrong yaml document format. Please use the following one:')
            print(format_str)
            self.ini_error = True
            return
        except FileNotFoundError as e:
            print('File \'{}\' not found.'.format(e.filename))
            self.ini_error = True
            return

        # set the common args stub
        self.comargs = {}
        self.comargs['cos'] = secret['cos']
        self.comargs['rabbitamqp_url'] = secret['rabbitamqp']['url']
        self.comargs['target_bucket'] = self.target_bucket
        self.comargs['target_fname'] = self.target_fname

        # two separate queues, the reducer waits for the mappers and the orchestrator waits for the reducer
        self.mapper_qid = 'mapperQueue'
        self.reducer_qid = 'reducerQueue'

    def run(self, mapper, nthreads):
        # check if initialization was good
        if self.ini_error:
            return -4
        # validation of parameters
        if nthreads < 1:
            print(
                'Minimum number of partitions or threads must be 1. \nExiting...'
            )
            return -1
        if mapper != 'CountingWords' and mapper != 'WordCount':
            print(
                '{} is not supported as a mapper yet. Supported mappers: CountingWords, WordCount. \nExiting...'
                .format(mapper))
            return -2

        # prepare arguments for the mapper (mapper args)
        chunk_size = int(self.fsize / nthreads)
        mapargs = self.comargs.copy()
        mapargs['qid'] = self.mapper_qid

        # start a connection with the queue system
        connection = pika.BlockingConnection(self.pika_params)
        channel = connection.channel()
        channel.queue_declare(queue=self.mapper_qid)
        channel.queue_purge(
            queue=self.mapper_qid)  # ensure no message was left

        # measure time
        start_t = time.time()

        # dispatch mappers except the last one
        for i in range(0, nthreads - 1):
            mapargs['index'] = str(i)
            mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * i,
                                                    chunk_size * (i + 1))
            self.cf.invoke(mapper, mapargs)
            #print('[{}]'.format(mapargs['index']), chunk_size*i, 'to', chunk_size*(i+1))

        # dispatch the last mapper, so that it takes the rest of the file
        mapargs['index'] = str(nthreads - 1)
        mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * (nthreads - 1),
                                                self.fsize)
        self.cf.invoke(mapper, mapargs)
        #print('[{}]'.format(mapargs['index']), chunk_size*(nthreads-1), 'to', self.fsize)

        # prepare arguments for the reducer (reducer args)
        redargs = self.comargs.copy()
        redargs['reduce_{}'.format(mapper)] = 'yes'
        redargs['nthreads'] = nthreads
        redargs['mapper_qid'] = self.mapper_qid
        redargs['reducer_qid'] = self.reducer_qid

        channel.queue_declare(queue=self.reducer_qid)
        channel.queue_purge(
            queue=self.reducer_qid)  # ensure no message was left

        self.cf.invoke('Reducer', redargs)

        # wait for the reducer to finish
        channel.basic_consume(queue=self.reducer_qid,
                              on_message_callback=SingleCallback())
        channel.start_consuming()

        # measure time
        end_t = time.time()

        connection.close()

        print('Done.\nExecution time: {0:.5g}s'.format(end_t - start_t))

    def claimFile(self, result_type, result_fname):
        # check if initialization was good
        if self.ini_error:
            return -4

        try:
            result_file = open(result_fname, "w")
            cos_result = self.cb.get_object(
                self.target_bucket,
                '{}/{}-result'.format(self.target_fname, result_type))
            result_file.write(cos_result.decode('utf-8'))
            result_file.close()
        except Exception:
            print(
                'Something went wrong, could not download result file for: {}, action: {}'
                .format(self.target_fname, result_type))
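run() blocks on channel.start_consuming() with a SingleCallback instance that this example never defines. A minimal sketch of what such a helper could look like, assuming the reducer publishes exactly one completion message:

class SingleCallback:
    # Callable pika consumer: acknowledge the first message and stop the
    # consuming loop so run() can continue past start_consuming().
    def __call__(self, channel, method, properties, body):
        channel.basic_ack(delivery_tag=method.delivery_tag)
        channel.stop_consuming()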
Example no. 5
import shutil, yaml, os
from ibm_cf_connector import CloudFunctions

if __name__ == '__main__':
    #create zip files
    shutil.make_archive('reduce', 'zip', 'Reduce')
    shutil.make_archive('countingWords', 'zip', 'CountingWords')
    shutil.make_archive('wordCount', 'zip', 'WordCount')

    #load ibm_cloud_config file
    with open('ibm_cloud_config', 'r') as config_file:
        res = yaml.safe_load(config_file)
    #configure cloud function library
    cf = CloudFunctions(res['ibm_cf'])

    #create new actions; they are updated if they already exist
    with open('reduce.zip', 'rb') as f:
        cf.create_action('reduce', f.read(), kind='python:3.7')

    with open('wordCount.zip', 'rb') as f:
        cf.create_action('wordCount', f.read(), kind='python:3.7')

    with open('countingWords.zip', 'rb') as f:
        cf.create_action('countingWords', f.read(), kind='python:3.7')

    #delete zip files
    os.remove('reduce.zip')
    os.remove('countingWords.zip')
    os.remove('wordCount.zip')
# The remainder of this example is a separate driver script; it additionally
# assumes these imports plus the project-local COSBackend and CloudFunctions.
import sys
import asyncio
import yaml
from datetime import datetime

with open('ibm_cloud_config', 'r') as config_file:
    res = yaml.safe_load(config_file)

if len(sys.argv) != 3:
    print(
        "Two parameters are required: the file to analyze and the number of partitions"
    )
    sys.exit()

bucket_name = input("Enter the IBM COS bucket name\n")
program = int(
    input("Select a program:\n1. Counting Words\n2. Word Count\n"))
if (program == 1 or program == 2):
    cos_backend = COSBackend(res.get('ibm_cos'))
    ibm_cf = CloudFunctions(res['ibm_cf'])
    file = sys.argv[1]
    file_size = int(
        cos_backend.head_object(bucket_name, file).get('content-length'))
    partition_size = file_size / int(sys.argv[2])
    params = {
        'program': program,
        'file_name': file,
        'cos_params': res.get('ibm_cos'),
        'bucket_name': bucket_name
    }
    loop = asyncio.get_event_loop()
    tasks = []
    initial_time = datetime.now()

    for i in range(int(sys.argv[2])):
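The snippet is truncated at the loop header. Assuming each iteration dispatches one partition through the event loop (the 'map' action name and the start/end parameter names are assumptions, echoing the byte-range convention of the other examples), the body might look like this:

        # Hypothetical continuation: run the blocking invoke() in a thread
        # executor so the partitions are dispatched concurrently.
        invocation = dict(params, start=int(i * partition_size),
                          end=int((i + 1) * partition_size))
        tasks.append(loop.run_in_executor(None, ibm_cf.invoke, 'map', invocation))

A closing loop.run_until_complete(asyncio.gather(*tasks)) would then wait for every dispatch to finish.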