def invokeFunctions(function, nFunctions, fileSize, fileName, res):
    """Fan out `function` over nFunctions byte-range partitions of fileName.

    The first nFunctions-1 invocations each cover an equal-size slice; the
    final one absorbs the remainder up to fileSize and is tagged with
    functionNumber -1 so the reducer can identify it. A blocking 'reduce'
    invocation then gathers the partial results (blocking so the total time
    to finish can be measured by the caller).
    """
    cf = CloudFunctions(res['ibm_cf'])
    sliceLen = int(fileSize / nFunctions)

    def dispatch(low, high, tag):
        # param -> res, fileName, bottom and top range, function number
        cf.invoke(
            function, {
                "res": res,
                "fileName": fileName,
                "topRange": str(high),
                "bottomRange": str(low),
                "functionNumber": str(tag)
            })

    low = 0
    for idx in range(nFunctions - 1):
        dispatch(low, low + sliceLen, idx)
        low += sliceLen
    # last partition: top bound is the file size itself, tagged -1
    dispatch(low, fileSize, -1)

    # reduce receives messages from the mappers; invoked with result since
    # we want to wait until everything has finished
    cf.invoke_with_result("reduce", {
        "res": res,
        "iter": nFunctions,
        "fileName": fileName
    })
def main(filename, nchunks, op):
    """Run a serverless map/reduce word count over a file stored in COS.

    :param filename: name of the source file in the 'originals' COS bucket
    :param nchunks: number of parallel mapper invocations (must be >= 1)
    :param op: 'count' (total words) or 'diffcount' (per-word frequencies)
    :returns: the reduced result, parsed from the 'resultados' COS object
    """
    # Time measurement begin (for testing)
    i_time=datetime.now()

    # Checking parameters
    if op not in ['count', 'diffcount']:
        print("Valid operators are count (total number of words) or diffcount (counts amounts of each individual word)")
        exit(1)
    if nchunks < 1:
        print("Number of parallel processes must be a positive integer!")
        exit(1)

    # Reading config files and preparing the function backend
    configIBMCloud=readParamsCloudF()
    configCOS= readParamsCOS()
    functionbackend = CloudFunctions(configIBMCloud)

    # Preparing dictionaries to serve as function arguments
    argsOriginal = {}
    argsOriginal['filename']=filename
    argsOriginal['configCOS']= configCOS
    argsOriginal['bucket'] = 'originals'
    argsTemp = {}
    argsTemp['configCOS']= configCOS
    argsTemp['targetBucket'] = 'temps1'
    argsTemp['sourceBucket'] = 'originals'
    argsTemp['sourceFile'] = filename
    argsTemp['op']=op

    # Reading the header of the origin file in COS (located in bucket 'originals')
    # and calculating chunk size
    headers=functionbackend.invoke_with_result('GetHead', argsOriginal)
    if 'error' in headers.keys():
        print('File does not exist in target COS. Exiting program')
        exit(1)
    sizeFile = int(headers.get('content-length'))

    # Begin chunking: dispatch one 'Map' invocation per [startbyte, endbyte] slice
    print('All parameters obtained. Beginning chunking')
    currentByte=0
    chunkCounter =0
    # +1 so that nchunks slices always cover the whole file
    chunkSize=(sizeFile//nchunks)+1
    while currentByte<sizeFile:
        argsTemp['startbyte']=currentByte
        if (currentByte+chunkSize)>sizeFile:
            argsTemp['endbyte']=sizeFile
        else:
            argsTemp['endbyte']=currentByte+chunkSize
        argsTemp['targetFile']='temp'+str(chunkCounter)
        #mapper.main(argsTemp) #discomment this for linear mapping (requires mapper.py)
        functionbackend.invoke('Map', argsTemp)
        # +1 because start/end byte offsets are treated as inclusive bounds
        currentByte=currentByte+chunkSize+1
        chunkCounter+=1

    # Wait until all mappers have finished (all temp files created),
    # polling the 'temps1' bucket every 100 ms
    print('All chunks sent. Waiting for results...')
    ##listObjects=cosbackend.list_objects('temp1')
    listObjects= (functionbackend.invoke_with_result('ListObjects', { 'bucket': 'temps1' , 'configCOS': configCOS})).get('files')
    timeout=0
    while(len(listObjects)<nchunks):
        time.sleep(0.1)
        listObjects= (functionbackend.invoke_with_result('ListObjects', { 'bucket': 'temps1' , 'configCOS': configCOS})).get('files')
        timeout+=1
        if (timeout>600):
            # Note: We discovered that with files that had specific complex UTF characters,
            # there's a chance that the split will accidentally break up multi-byte
            # characters between two mappers, leaving both mappers confused and unable to
            # return valid results. Therefore, the program assumes that if a full minute
            # goes by without changes, the mappers are probably stuck and the execution
            # has failed (600 polls * 0.1 s sleep, plus invocation latency)
            print('Program timed out - Mapper found an issue. Please check the encoding in your file!')
            print('Please wait while we clean up temp files...')
            for elem in listObjects:
                functionbackend.invoke('Delete', {'bucket':'temps1', 'configCOS':configCOS, 'filename':elem})
            print('All files cleaned up. Exiting program.')
            exit(1)

    # Begin reduction: the 'Reducer' action merges all temp objects into 'resultados'
    print('Mapping complete. Proceed to reduction')
    #result=reduce.main({'bucket':'temps1', 'configCOS':configCOS, 'op':op, 'prefix':'temp'})
    functionbackend.invoke_with_result('Reducer', {'bucket':'temps1', 'configCOS':configCOS, 'op':op, 'prefix':'temp'})
    result=(functionbackend.invoke_with_result('GetObject', {'bucket':'temps1', 'configCOS':configCOS, 'filename':'resultados'})).get('content')
    result=json.loads(result)

    # time measurement end and printing (for reference)
    f_time=datetime.now()
    print('Total elapsed time='+str(f_time-i_time))

    # cleanup of the result object (temp chunk objects are left in place here —
    # presumably the Reducer removes them; TODO confirm)
    functionbackend.invoke('Delete', {'bucket':'temps1', 'configCOS':configCOS, 'filename':'resultados'})
    return result
import sys
from cos_backend import COS_Backend
from ibm_cf_connector import CloudFunctions
import yaml
import time

if __name__ == '__main__':
    # Load IBM Cloud credentials (COS + Cloud Functions) from the YAML config
    with open('ibm_cloud_config.txt', 'r') as config_file:
        res = yaml.safe_load(config_file)
    ibm_cos = res['ibm_cos']
    ibm_cf = res['ibm_cf']
    cos = COS_Backend(ibm_cos)
    cloud = CloudFunctions(ibm_cf)

    fileread = sys.argv[1]
    # number of partitions; not used in this deploy step — presumably consumed
    # by the deployed actions (TODO confirm)
    partitions = int(sys.argv[2])

    # Upload the input file to COS.
    # FIX: all file handles below were opened and never closed (leak);
    # 'with' guarantees closing even on error.
    with open(fileread, 'r', encoding='utf-8', errors='ignore') as f:
        cos.put_object('sdprac1', fileread, f.read())

    # Deploy the required actions to the cloud from their zip packages
    for action in ('wordcount', 'reduce', 'countwords'):
        with open(action + '.zip', 'rb') as f:
            cloud.create_action(action, f.read())
def __init__(self, target_bucket, target_fname, upload=False):
    """Initialise the COS, Cloud Functions and RabbitMQ backends from secret.yaml.

    On any configuration problem this prints a diagnostic, sets
    ``self.ini_error = True`` and returns early; callers must check
    ``ini_error`` before using the instance.

    :param target_bucket: COS bucket that holds (or will hold) the target file
    :param target_fname: name of the file to process
    :param upload: when True, upload the local file to COS first
    """
    self.target_fname = target_fname
    self.target_bucket = target_bucket
    self.ini_error = False
    # template shown to the user when secret.yaml is malformed
    format_str = "cloudfunctions:\n 'endpoint': ''\n 'namespace': ''\n 'api_key': ''\nrabbitamqp:\n 'url': ''\ncos:\n service_endpoint: ''\n secret_key: ''\n access_key: ''"
    try:
        # load keys securely
        with open('secret.yaml', 'r') as f:
            secret = yaml.safe_load(f)

        # initialize the remote storage wrapper, and upload the target file
        self.cb = COSBackend(secret['cos']['service_endpoint'],
                             secret['cos']['secret_key'],
                             secret['cos']['access_key'])
        if upload:
            # 'with' fixes the original's unclosed-on-error file handle
            with open(self.target_fname, "rb") as target_file:
                self.cb.put_object(target_bucket, target_fname, target_file.read())

        # retrieve file length, ensure file has been uploaded
        try:
            self.fsize = int(
                self.cb.head_object(self.target_bucket,
                                    self.target_fname)['content-length'])
        except Exception:  # narrowed from bare 'except:' (kept broad: any backend failure)
            print('File \'{}\' was not found in this bucket \'{}\'. Upload it and retry.'
                  .format(self.target_fname, self.target_bucket))
            self.ini_error = True
            return

        # initialize the function wrapper
        config = {
            'endpoint': secret['cloudfunctions']['endpoint'],
            'namespace': secret['cloudfunctions']['namespace'],
            'api_key': secret['cloudfunctions']['api_key'],
        }
        self.cf = CloudFunctions(config)

        # initialize the queue system
        self.pika_params = pika.URLParameters(secret['rabbitamqp']['url'])
    except KeyError:
        print('Wrong yaml document format. Please use the following one:')
        print(format_str)
        self.ini_error = True
        # BUG FIX: the original fell through to the self.comargs block below,
        # where `secret` may be unbound (or the missing key re-raised),
        # crashing instead of failing gracefully
        return
    except FileNotFoundError as e:
        print('File \'{}\' not found.'.format(e.filename))
        self.ini_error = True
        return

    # set the common args stub shared by every remote invocation
    self.comargs = {
        'cos': secret['cos'],
        'rabbitamqp_url': secret['rabbitamqp']['url'],
        'target_bucket': self.target_bucket,
        'target_fname': self.target_fname,
    }
    # two separate queues: the reducer waits for the mappers and the
    # orchestrator waits for the reducer
    self.mapper_qid = 'mapperQueue'
    self.reducer_qid = 'reducerQueue'
class Orchestrator:
    """Drives a serverless map/reduce job: splits the target COS object into
    byte ranges, invokes one mapper per range, then a reducer, synchronising
    through two RabbitMQ queues."""

    def __init__(self, target_bucket, target_fname, upload=False):
        """Initialise the COS, Cloud Functions and RabbitMQ backends from secret.yaml.

        On any configuration problem this prints a diagnostic, sets
        ``self.ini_error = True`` and returns early; callers must check
        ``ini_error`` before using the instance.
        """
        self.target_fname = target_fname
        self.target_bucket = target_bucket
        self.ini_error = False
        # template shown to the user when secret.yaml is malformed
        format_str = "cloudfunctions:\n 'endpoint': ''\n 'namespace': ''\n 'api_key': ''\nrabbitamqp:\n 'url': ''\ncos:\n service_endpoint: ''\n secret_key: ''\n access_key: ''"
        try:
            # load keys securely
            with open('secret.yaml', 'r') as f:
                secret = yaml.safe_load(f)

            # initialize the remote storage wrapper, and upload the target file
            self.cb = COSBackend(secret['cos']['service_endpoint'],
                                 secret['cos']['secret_key'],
                                 secret['cos']['access_key'])
            if upload:
                # 'with' fixes the original's unclosed-on-error file handle
                with open(self.target_fname, "rb") as target_file:
                    self.cb.put_object(target_bucket, target_fname,
                                       target_file.read())

            # retrieve file length, ensure file has been uploaded
            try:
                self.fsize = int(
                    self.cb.head_object(self.target_bucket,
                                        self.target_fname)['content-length'])
            except Exception:  # narrowed from bare 'except:'
                print('File \'{}\' was not found in this bucket \'{}\'. Upload it and retry.'
                      .format(self.target_fname, self.target_bucket))
                self.ini_error = True
                return

            # initialize the function wrapper
            config = {
                'endpoint': secret['cloudfunctions']['endpoint'],
                'namespace': secret['cloudfunctions']['namespace'],
                'api_key': secret['cloudfunctions']['api_key'],
            }
            self.cf = CloudFunctions(config)

            # initialize the queue system
            self.pika_params = pika.URLParameters(secret['rabbitamqp']['url'])
        except KeyError:
            print('Wrong yaml document format. Please use the following one:')
            print(format_str)
            self.ini_error = True
            # BUG FIX: the original fell through to the self.comargs block
            # below, where `secret` may be unbound (or the missing key
            # re-raised), crashing instead of failing gracefully
            return
        except FileNotFoundError as e:
            print('File \'{}\' not found.'.format(e.filename))
            self.ini_error = True
            return

        # set the common args stub shared by every remote invocation
        self.comargs = {
            'cos': secret['cos'],
            'rabbitamqp_url': secret['rabbitamqp']['url'],
            'target_bucket': self.target_bucket,
            'target_fname': self.target_fname,
        }
        # two separate queues: the reducer waits for the mappers and the
        # orchestrator waits for the reducer
        self.mapper_qid = 'mapperQueue'
        self.reducer_qid = 'reducerQueue'

    def run(self, mapper, nthreads):
        """Partition the file, dispatch `nthreads` mappers plus one reducer,
        and block until the reducer signals completion.

        :param mapper: action name, 'CountingWords' or 'WordCount'
        :param nthreads: number of mapper invocations (>= 1)
        :returns: None on success; -4/-1/-2 on init/parameter errors
        """
        # check if initialization was good
        if self.ini_error:
            return -4

        # validation of parameters
        if nthreads < 1:
            print(
                'Minimum number of partitions or threads must be 1. \nExiting...'
            )
            return -1
        if mapper != 'CountingWords' and mapper != 'WordCount':
            print(
                '{} is not supported as a mapper yet. Supported mappers: CountingWords, WordCount. \nExiting...'
                .format(mapper))
            return -2

        # prepare arguments for the mapper (mapper args)
        chunk_size = int(self.fsize / nthreads)
        mapargs = self.comargs.copy()
        mapargs['qid'] = self.mapper_qid

        # start connection with the queue system
        connection = pika.BlockingConnection(self.pika_params)
        channel = connection.channel()
        channel.queue_declare(queue=self.mapper_qid)
        channel.queue_purge(queue=self.mapper_qid)  # ensure no message was left

        # measure time
        start_t = time.time()

        # dispatch mappers except the last one.
        # NOTE(review): HTTP Range bounds are inclusive, so consecutive
        # ranges 'bytes=a-b' / 'bytes=b-c' overlap by one byte at every
        # boundary — confirm the mappers compensate for this.
        for i in range(0, nthreads - 1):
            mapargs['index'] = str(i)
            mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * i,
                                                    chunk_size * (i + 1))
            self.cf.invoke(mapper, mapargs)

        # dispatch the last mapper, so that it takes the rest of the file.
        # BUG FIX: index was passed as a bare int here, inconsistent with
        # str(i) for every other mapper
        mapargs['index'] = str(nthreads - 1)
        mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * (nthreads - 1),
                                                self.fsize)
        self.cf.invoke(mapper, mapargs)

        # prepare arguments for the reducer (reducer args)
        redargs = self.comargs.copy()
        redargs['reduce_{}'.format(mapper)] = 'yes'
        redargs['nthreads'] = nthreads
        redargs['mapper_qid'] = self.mapper_qid
        redargs['reducer_qid'] = self.reducer_qid
        channel.queue_declare(queue=self.reducer_qid)
        channel.queue_purge(queue=self.reducer_qid)  # ensure no message was left
        self.cf.invoke('Reducer', redargs)

        # wait for the reducer to finish
        channel.basic_consume(queue=self.reducer_qid,
                              on_message_callback=SingleCallback())
        channel.start_consuming()

        # measure time
        end_t = time.time()
        connection.close()
        print('Done.\nExecution time: {0:.5g}s'.format(end_t - start_t))

    def claimFile(self, result_type, result_fname):
        """Download the '<target>/<result_type>-result' COS object into a
        local file named `result_fname`.

        :returns: None on success; -4 if initialization failed
        """
        # check if initialization was good
        if self.ini_error:
            return -4
        try:
            # fetch first, then open: avoids leaving an empty local file
            # behind when the download fails (original opened the file first)
            cos_result = self.cb.get_object(
                self.target_bucket,
                '{}/{}-result'.format(self.target_fname, result_type))
            with open(result_fname, "w") as result_file:
                result_file.write(cos_result.decode('utf-8'))
        except Exception:  # narrowed from bare 'except:'
            print(
                'Something went wrong, could not download result file for: {}, action: {}'
                .format(self.target_fname, result_type))
import shutil, yaml, os
from ibm_cf_connector import CloudFunctions

if __name__ == '__main__':
    # (action/zip name, source directory) pairs to package and deploy
    actions = [('reduce', 'Reduce'),
               ('countingWords', 'CountingWords'),
               ('wordCount', 'WordCount')]

    # create zip files
    for zip_name, src_dir in actions:
        shutil.make_archive(zip_name, 'zip', src_dir)

    # load ibm_cloud_config file
    with open('ibm_cloud_config', 'r') as config_file:
        res = yaml.safe_load(config_file)

    # configure cloud function library
    cf = CloudFunctions(res['ibm_cf'])

    # create new actions (updated if they already exist), then delete the
    # zip files. FIX: the original opened each zip without ever closing it;
    # 'with' guarantees the handle is released before os.remove.
    for zip_name, _ in actions:
        zip_path = zip_name + '.zip'
        with open(zip_path, 'rb') as f:
            cf.create_action(zip_name, f.read(), kind='python:3.7')
        os.remove(zip_path)
with open('ibm_cloud_config', 'r') as config_file: res = yaml.safe_load(config_file) if len(sys.argv) != 3: print( "Han d'haver dos parametres: El fitxer a analitzar i el numero de particions" ) sys.exit() bucket_name = input("Introdueixi el bucket name del IBM COS\n") program = int( input("Seleccioni programa:\n1. Counting Words\n2. Word Count\n")) if (program == 1 or program == 2): cos_backend = COSBackend(res.get('ibm_cos')) ibm_cf = CloudFunctions(res['ibm_cf']) file = sys.argv[1] file_size = int( cos_backend.head_object(bucket_name, file).get('content-length')) partition_size = file_size / int(sys.argv[2]) params = { 'program': program, 'file_name': file, 'cos_params': res.get('ibm_cos'), 'bucket_name': bucket_name } loop = asyncio.get_event_loop() tasks = [] initial_time = datetime.now() for i in range(int(sys.argv[2])):