Example #1

import time

import pika
import yaml

# project-local helpers (import paths assumed; not shown in this example)
from cos_backend import COSBackend
from cloud_functions import CloudFunctions
# SingleCallback is another project-local helper; a sketch follows the example
class Orchestrator:
    def __init__(self, target_bucket, target_fname, upload=False):
        self.target_fname = target_fname
        self.target_bucket = target_bucket
        self.ini_error = False
        format_str = ("cloudfunctions:\n"
                      "  'endpoint': ''\n"
                      "  'namespace': ''\n"
                      "  'api_key': ''\n"
                      "rabbitamqp:\n"
                      "  'url': ''\n"
                      "cos:\n"
                      "  service_endpoint: ''\n"
                      "  secret_key: ''\n"
                      "  access_key: ''")

        try:
            # load keys securely
            with open('secret.yaml', 'r') as f:
                secret = yaml.safe_load(f)

            # initialize the remote storage wrapper and upload the target file
            self.cb = COSBackend(secret['cos']['service_endpoint'],
                                 secret['cos']['secret_key'],
                                 secret['cos']['access_key'])
            if upload:
                with open(self.target_fname, "rb") as target_file:
                    self.cb.put_object(target_bucket, target_fname,
                                       target_file.read())

            # retrieve file length, ensure file has been uploaded
            # retrieve the file length; this also verifies the file was uploaded
            try:
                self.fsize = int(
                    self.cb.head_object(self.target_bucket,
                                        self.target_fname)['content-length'])
            except Exception:
                print("File '{}' was not found in bucket '{}'. Upload it and retry."
                      .format(self.target_fname, self.target_bucket))
                self.ini_error = True
                return

            # initialize the function wrapper
            config = {
                'endpoint': secret['cloudfunctions']['endpoint'],
                'namespace': secret['cloudfunctions']['namespace'],
                'api_key': secret['cloudfunctions']['api_key'],
            }
            self.cf = CloudFunctions(config)

            # initialize the queue system
            self.pika_params = pika.URLParameters(secret['rabbitamqp']['url'])

        except KeyError:
            print('Wrong yaml document format. Please use the following one:')
            print(format_str)
            self.ini_error = True
            return
        except FileNotFoundError as e:
            print("File '{}' not found.".format(e.filename))
            self.ini_error = True
            return

        # set the common args stub shared by the mappers and the reducer
        self.comargs = {
            'cos': secret['cos'],
            'rabbitamqp_url': secret['rabbitamqp']['url'],
            'target_bucket': self.target_bucket,
            'target_fname': self.target_fname,
        }

        # two separate queues: the reducer waits for the mappers and the orchestrator waits for the reducer
        self.mapper_qid = 'mapperQueue'
        self.reducer_qid = 'reducerQueue'

    def run(self, mapper, nthreads):
        # check if initialization was good
        if self.ini_error:
            return -4
        # validation of parameters
        if nthreads < 1:
            print('The minimum number of partitions/threads is 1.\nExiting...')
            return -1
        if mapper not in ('CountingWords', 'WordCount'):
            print('{} is not supported as a mapper yet. Supported mappers: '
                  'CountingWords, WordCount.\nExiting...'.format(mapper))
            return -2

        # prepare arguments for the mapper (mapper args)
        chunk_size = self.fsize // nthreads  # size of each mapper's byte range
        mapargs = self.comargs.copy()
        mapargs['qid'] = self.mapper_qid

        # start a connection with the queue system
        connection = pika.BlockingConnection(self.pika_params)
        channel = connection.channel()
        channel.queue_declare(queue=self.mapper_qid)
        channel.queue_purge(queue=self.mapper_qid)  # ensure no stale message was left

        # measure time
        start_t = time.time()

        # dispatch all mappers except the last one
        for i in range(nthreads - 1):
            mapargs['index'] = str(i)
            # HTTP byte ranges are inclusive on both ends, hence the -1
            mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * i,
                                                    chunk_size * (i + 1) - 1)
            self.cf.invoke(mapper, mapargs)

        # dispatch the last mapper so that it takes the rest of the file
        mapargs['index'] = str(nthreads - 1)
        mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * (nthreads - 1),
                                                self.fsize - 1)
        self.cf.invoke(mapper, mapargs)

        # prepare arguments for the reducer (reducer args)
        redargs = self.comargs.copy()
        redargs['reduce_{}'.format(mapper)] = 'yes'
        redargs['nthreads'] = nthreads
        redargs['mapper_qid'] = self.mapper_qid
        redargs['reducer_qid'] = self.reducer_qid

        channel.queue_declare(queue=self.reducer_qid)
        channel.queue_purge(
            queue=self.reducer_qid)  # ensure no message was left

        self.cf.invoke('Reducer', redargs)

        # wait for the reducer to finish
        channel.basic_consume(queue=self.reducer_qid,
                              on_message_callback=SingleCallback())
        channel.start_consuming()

        # measure time
        end_t = time.time()

        connection.close()

        print('Done.\nExecution time: {0:.5g}s'.format(end_t - start_t))

    def claimFile(self, result_type, result_fname):
        # check if initialization was good
        if self.ini_error:
            return -4

        try:
            # fetch the result first so no empty local file is left on failure
            cos_result = self.cb.get_object(
                self.target_bucket,
                '{}/{}-result'.format(self.target_fname, result_type))
            with open(result_fname, "w") as result_file:
                result_file.write(cos_result.decode('utf-8'))
        except Exception:
            print(
                'Something went wrong: could not download the result file for {}, action: {}'
                .format(self.target_fname, result_type))
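
SingleCallback is used in run() but never defined in this example. A minimal sketch of what it could look like, assuming its only job is to acknowledge the reducer's single completion message and break out of the blocking consumer (the class body below is inferred from usage, not taken from the source):

class SingleCallback:
    # assumed behavior: a pika on_message_callback that stops after one message

    def __call__(self, channel, method, properties, body):
        # acknowledge the reducer's completion message...
        channel.basic_ack(delivery_tag=method.delivery_tag)
        # ...and break out of channel.start_consuming()
        channel.stop_consuming()

With the helpers in place, a hypothetical driver would be orch = Orchestrator('some-bucket', 'input.txt', upload=True), then orch.run('WordCount', 8), then orch.claimFile('WordCount', 'wc-result.txt') (bucket and file names are made up for illustration).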
Example #2

# the top of this file is elided in the original listing; the imports below are assumed
import sys
import time

import yaml

from cos_backend import COSBackend  # project-local helper; import path assumed

if __name__ == '__main__':
    #read the arguments given
    fileName = sys.argv[1]
    nFunctions = 1
    #load config file
    with open('../ibm_cloud_config', 'r') as config_file:
        res = yaml.safe_load(config_file)

    #configure COS library
    odb = COSBackend(res['ibm_cos'])

    fileSize = int(
        odb.head_object(res['ibm_cos']["bucket"], fileName)["content-length"])
    # check that there are enough workers: grow nFunctions until each chunk
    # is below ~110000000 bytes (an approximation; the exact limit is unknown)
    while (fileSize / nFunctions) > 110000000:
        nFunctions += 1

    for i in (1, 2, 3, 4, 5, 7, 10, 12, 15, 20, 25, 30, 35, 40, 45, 50):
        # invoke the functions, measuring the execution time
        i *= nFunctions
        timeWordCount = 0
        timeCountingWords = 0
        for _ in range(0, 4):  # average over 4 runs
            start = time.time()
            invokeFunctions('wordCount', i, fileSize, fileName, res)
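
The fragment cuts off inside the timing loop, and invokeFunctions itself is not shown. As a rough sketch of what such a helper presumably does here, splitting the file into byte ranges and invoking the named action once per range (every identifier below is inferred, not taken from the source):

def invokeFunctions(action, n, file_size, file_name, res):
    # hypothetical sketch: one cloud-function invocation per byte-range partition
    ibm_cf = CloudFunctions(res['ibm_cf'])  # assumes an 'ibm_cf' config section
    chunk = file_size // n
    for i in range(n):
        end = file_size if i == n - 1 else (i + 1) * chunk
        ibm_cf.invoke(action, {
            'file_name': file_name,
            'range': (i * chunk, end),
            'cos_params': res['ibm_cos'],
        })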
Example #3

if len(sys.argv) != 3:
    print("Two parameters are required: the file to analyze and the number of partitions")
    sys.exit()

bucket_name = input("Enter the IBM COS bucket name\n")
program = int(
    input("Select a program:\n1. Counting Words\n2. Word Count\n"))
if program in (1, 2):
    # res is loaded from a config file earlier in the script (elided here)
    cos_backend = COSBackend(res.get('ibm_cos'))
    ibm_cf = CloudFunctions(res['ibm_cf'])
    file = sys.argv[1]
    file_size = int(
        cos_backend.head_object(bucket_name, file).get('content-length'))
    num_partitions = int(sys.argv[2])
    partition_size = file_size // num_partitions  # integer byte offsets
    params = {
        'program': program,
        'file_name': file,
        'cos_params': res.get('ibm_cos'),
        'bucket_name': bucket_name
    }
    loop = asyncio.get_event_loop()
    tasks = []
    initial_time = datetime.now()

    for i in range(num_partitions):
        params['num_partition'] = i
        # the last partition absorbs the remainder of the integer division
        end = file_size if i == num_partitions - 1 else (i + 1) * partition_size
        params['space'] = (i * partition_size, end)
        tasks.append(loop.create_task(perform_cloud('map', params.copy())))
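
The listing ends before the tasks are awaited. A typical continuation under these assumptions (a sketch, not the author's code) gathers the map tasks on the loop and reports the elapsed time:

    # hypothetical continuation: wait for every map task, then report elapsed time
    loop.run_until_complete(asyncio.gather(*tasks))
    print('map phase finished in', datetime.now() - initial_time)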