def main(args):
    # initialize cos wrapper
    cb = COSBackend(args['cos']['service_endpoint'], args['cos']['secret_key'],
                    args['cos']['access_key'])

    # initialize queue system for the mappers' queue
    pika_params = pika.URLParameters(args['rabbitamqp_url'])
    connection = pika.BlockingConnection(pika_params)
    channel = connection.channel()
    channel.queue_declare(queue=args['mapper_qid'])

    # check what we are reducing
    if 'reduce_WordCount' in args and args['reduce_WordCount'] == 'yes':
        callback = ReduceCallback(cb, args['target_bucket'], args['nthreads'])  # create a callback
        channel.basic_consume(queue=args['mapper_qid'], on_message_callback=callback)  # set the callback
        channel.start_consuming()
        # commit result
        cb.put_object(args['target_bucket'], '{}/WC-result'.format(args['target_fname']),
                      json.dumps(callback.result))

    if 'reduce_CountingWords' in args and args['reduce_CountingWords'] == 'yes':
        callback = ReduceCallback(cb, args['target_bucket'], args['nthreads'])
        channel.basic_consume(queue=args['mapper_qid'], on_message_callback=callback)
        channel.start_consuming()
        cb.put_object(args['target_bucket'], '{}/CW-result'.format(args['target_fname']),
                      json.dumps(callback.result))

    # tell the orchestrator the job is done
    channel.basic_publish(exchange='', routing_key=args['reducer_qid'], body='OK')
    connection.close()
def funcio_unica(minim):
    cos = COSBackend()
    # upload every row of mat1 as 'filaN.txt'
    minim = 0
    maxim = x
    while minim < maxim:  # minim is the current row being processed
        dada = ""
        for j in range(y):  # j is the column currently being processed
            dada = dada + str(mat1[int(minim)][int(j)]) + ","
        dada = dada[:-1]
        dada = dada.encode()
        cos.put_object('sd-ori-un-buen-cubo', 'fila' + str(int(minim) + 1) + '.txt', dada)
        minim += 1
    # upload every column of mat2 as 'colN.txt'
    maxim = z
    minim = 0
    while minim < maxim:  # minim is the current column being processed
        dada = ""
        for j in range(y):  # j is the row currently being processed
            dada = dada + str(mat2[int(j)][int(minim)]) + ","
        dada = dada[:-1]
        dada = dada.encode()
        cos.put_object('sd-ori-un-buen-cubo', 'col' + str(int(minim) + 1) + '.txt', dada)
        minim += 1
def master(x, ibm_cos):
    obj = COSBackend(config=ibm_cos)

    def order(e):
        return e['LastModified']

    write_permission_list = []
    m = []
    finish = 0
    obj.put_object('practise2', 'result.json', json.dumps(m))
    l = obj.list_objects('practise2', 'p_write')
    while not finish:
        l.sort(key=order)
        current_id = l.pop(0)
        current_id = current_id['Key']
        file_to_write = current_id[2:]
        date_json = obj.list_objects('practise2', 'result.json')[0]['LastModified']
        # grant write permission to the oldest petitioner and remove its petition
        obj.put_object('practise2', file_to_write, "")
        obj.delete_object('practise2', "p_" + file_to_write)
        write_permission_list.append(int(file_to_write[7:-1]))
        next = 0
        while not next:
            time.sleep(x / 4)
            # the slave signals it is done by updating result.json
            if not obj.list_objects('practise2', 'result.json')[0]['LastModified'] == date_json:
                next = 1
                obj.delete_object('practise2', file_to_write)
        time.sleep(x)
        l = obj.list_objects('practise2', 'p_write')
        if not l:
            finish = 1
    return write_permission_list
def main(args):
    # initialize cos wrapper
    cb = COSBackend(args['cos']['service_endpoint'], args['cos']['secret_key'],
                    args['cos']['access_key'])

    # fetch the assigned range of bytes and parse that chunk into words, then count the number
    # of occurrences of each word
    # (note: this must be done in a single expression so that the object returned by
    # cb.get_object is freed by the garbage collector ASAP and reserved memory doesn't
    # stack up too much)
    words = re.findall(
        r'\w+',
        cb.get_object(args['target_bucket'], args['target_fname'],
                      extra_get_args={'Range': args['Range']}).decode('UTF-8', errors='ignore'))

    result = {}
    for word in words:
        adapted_word = word.lower()  # unidecode.unidecode(word).lower()
        if adapted_word in result:
            result[adapted_word] += 1
        else:
            result[adapted_word] = 1

    # commit result on the cloud
    result_tag = '{}/CW-result-{}'.format(args['target_fname'], args['index'])
    cb.put_object(args['target_bucket'], result_tag, json.dumps(result))

    # notify via queue, message = result file name on the cloud
    pika_params = pika.URLParameters(args['rabbitamqp_url'])
    connection = pika.BlockingConnection(pika_params)
    channel = connection.channel()
    channel.basic_publish(exchange='', routing_key=args['qid'], body=result_tag)
    connection.close()
def generatex(x, y, z, a):
    cos = COSBackend()
    matrixA = []
    matrixB = []
    # generate the two random matrices
    for m_value in range(x):
        valors = []
        for n_value in range(y):
            valors.append(random.randint(0, 10))
        matrixA.append(valors)
    for n_value in range(y):
        valors = []
        for l_value in range(z):
            valors.append(random.randint(0, 10))
        matrixB.append(valors)
    cos.put_object('practica-sd-mp', 'matrixA.txt', pickle.dumps(matrixA))
    cos.put_object('practica-sd-mp', 'matrixB.txt', pickle.dumps(matrixB))
    # split the work: worker (i, j) gets its block of rows from A and block of columns from B
    for i in range(nWorkersA):
        if mImpar != 0 and i == nWorkersA - 1:
            filesA = matrixA[i * a:]
        else:
            filesA = matrixA[i * a:i * a + a]
        for j in range(nWorkersB):
            columnesB = []
            if lImpar != 0 and j == nWorkersB - 1:
                columnesTotals = lImpar
            else:
                columnesTotals = a
            for k in range(columnesTotals):
                columna = [item[j * a + k] for item in matrixB]
                columnesB.append(columna)
            # now we have both the rows and the columns for this worker
            infoWorkers = []
            infoWorkers.append(filesA)
            infoWorkers.append(columnesB)
            cos.put_object('practica-sd-mp', f'{i}w{j}', pickle.dumps(infoWorkers))
def slave(id, x, ibm_cos):
    obj = COSBackend(config=ibm_cos)
    # ask the master for write permission
    obj.put_object('practise2', "p_write_{" + str(id) + "}", b"")
    my_turn = 0
    while not my_turn:
        time.sleep(x)
        if obj.list_objects('practise2', 'write_{' + str(id) + '}'):
            my_turn = 1
    # append our id to the shared result file
    result_file = json.loads(obj.get_object('practise2', 'result.json'))
    result_file.append(id)
    obj.put_object('practise2', 'result.json', json.dumps(result_file))
def map_count_words(file, args):
    cos_params = args.get('cos_params')
    num_partition = args.get('num_partition')
    bucket_name = args.get('bucket_name')
    file_name = args.get('file_name')
    cos = COSBackend(cos_params)
    num_words = len(file.split())
    file_to_create = "cw_" + file_name + str(num_partition)
    cos.put_object(bucket_name, file_to_create, str(num_words))
    return {'finish': "OK"}
def toRows(mat):
    cos = COSBackend()
    # store the rows of matrix A (AxB) in the bucket
    for x in range(0, dim1):
        row = mat[x, :]
        memfile = io.BytesIO()
        numpy.save(memfile, row)
        memfile.seek(0)
        serialized = json.dumps(memfile.read().decode('latin-1'))
        cos.put_object('cuc-bucket', 'A' + str(x), serialized)
def toColumns(mat):
    cos = COSBackend()
    # store the columns of matrix B (AxB) in the bucket
    for x in range(0, dim3):
        column = mat[:, x]
        memfile = io.BytesIO()
        numpy.save(memfile, column)
        memfile.seek(0)
        serialized = json.dumps(memfile.read().decode('latin-1'))
        cos.put_object('cuc-bucket', 'B' + str(x), serialized)
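# Not part of the original code: a minimal sketch of how a vector serialized by toRows /
# toColumns could be loaded back. It assumes cos.get_object returns the JSON string stored
# above; the helper name load_vector and the key 'A0' are hypothetical.
def load_vector(key):
    cos = COSBackend()
    serialized = cos.get_object('cuc-bucket', key)
    # reverse the serialization: JSON string -> latin-1 bytes -> .npy stream -> ndarray
    memfile = io.BytesIO(json.loads(serialized).encode('latin-1'))
    memfile.seek(0)
    return numpy.load(memfile)
# e.g. row0 = load_vector('A0')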
def ensamblar(results):
    global n, l, work
    cos = COSBackend()
    # when the split was uneven, the partial results must be flattened before reshaping
    if (n % work != 0 or l % work != 0) and (work < l or work < n):
        array = []
        for result in results:
            array = np.append(array, result)
        final = np.reshape(array, (n, l))
    else:
        final = np.reshape(results, (n, l))
    cos.put_object('____', 'matrizFinal', pickle.dumps(final, pickle.HIGHEST_PROTOCOL))
    return final
def multiplication_reduce(results):
    cos = COSBackend()
    matrixC = []
    # once this outer loop finishes, every case has been handled
    for indexWorkerFila in range(nWorkersA):
        for numeroFila in range(len(results[indexWorkerFila * nWorkersB])):
            fila = []
            for indexWorkerColumna in range(nWorkersB):
                contadorWorker = indexWorkerFila * nWorkersB + indexWorkerColumna
                for valor in results[contadorWorker][numeroFila]:
                    fila.append(valor)
            matrixC.append(fila)
    cos.put_object('practica-sd-mp', 'matrixC.txt', pickle.dumps(matrixC))
def funcio_normal_fila(minim):  # minim is the row where this worker starts
    cos = COSBackend()
    maxim = int(minim) + porcio_basica_fil
    while minim < maxim:  # minim is the current row being processed
        dada = ""
        for j in range(y):  # j is the column currently being processed
            dada = dada + str(mat1[int(minim)][int(j)]) + ","
        dada = dada[:-1]
        dada = dada.encode()
        cos.put_object('sd-ori-un-buen-cubo', 'fila' + str(int(minim) + 1) + '.txt', dada)
        minim += 1
def my_reduce_function(results):
    cos = COSBackend()
    matrix = numpy.zeros((dim1, dim3))
    # generate the final matrix from the partial results
    for xResult in results:
        for map_result in xResult:
            matrix[map_result[1], map_result[2]] = map_result[0]
    # put the final matrix into the bucket
    memfile = io.BytesIO()
    numpy.save(memfile, matrix)
    memfile.seek(0)
    serialized = json.dumps(memfile.read().decode('latin-1'))
    cos.put_object('cuc-bucket', 'matriu_result', serialized)
    return matrix
def reduce_word_count(args):
    file_name = args.get('file_name')
    num_partitions = args.get('num_partitions')
    cos = COSBackend(args.get('cos_params'))
    bucket_name = args.get('bucket_name')
    result_dict = {}
    # merge every partial word-count dictionary and delete each partial file afterwards
    for i in range(num_partitions):
        file = "wc_" + file_name + str(i)
        file_dict = json.loads(cos.get_object(bucket_name, file))
        cos.delete_object(bucket_name, file)
        result_dict = {
            key: result_dict.get(key, 0) + file_dict.get(key, 0)
            for key in set(result_dict) | set(file_dict)
        }
    cos.put_object(bucket_name, "final_" + file_name, json.dumps(result_dict))
    return {'finish': "OK"}
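# Illustration only, not part of the original code: the dictionary comprehension used in
# reduce_word_count merges two partial word-count dictionaries key by key. The partial
# results below are hypothetical.
def _merge_counts_example():
    a = {'cat': 2, 'dog': 1}   # hypothetical content of wc_<file>0
    b = {'dog': 3, 'bird': 4}  # hypothetical content of wc_<file>1
    merged = {key: a.get(key, 0) + b.get(key, 0) for key in set(a) | set(b)}
    assert merged == {'cat': 2, 'dog': 4, 'bird': 4}
    return merged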
def generateMatrix(name, pos, dimf, dimc):
    numpy.random.seed()
    cos = COSBackend()
    # generate random matrix
    mat_original = numpy.random.randint(MAX_RANDOM, size=(dimf, dimc))
    # upload to cloud
    memfile = io.BytesIO()
    numpy.save(memfile, mat_original)
    memfile.seek(0)
    serialized = json.dumps(memfile.read().decode('latin-1'))
    cos.put_object('cuc-bucket', name, serialized)
    if pos == 'A':
        toRows(mat_original)
    else:
        toColumns(mat_original)
def push_matrix(m, n, l, w):
    obj = COSBackend(dic)
    # Generate random matrices
    matrix1 = np.random.randint(-5, 6, size=(m, n))
    matrix2 = np.random.randint(-5, 11, size=(n, l))
    # Push A submatrices
    for i in range(0, w):
        infM = int((i * m) / w)
        supM = int((((i + 1) * m) / w) - 1)
        submatrix1 = matrix1[infM:supM + 1, :]
        obj.put_object('prac1', 'A' + str(i) + '.mtx', pickle.dumps(submatrix1))
    # Push B submatrices
    for j in range(0, w):
        infL = int((j * l) / w)
        supL = int((((j + 1) * l) / w) - 1)
        submatrix2 = matrix2[:, infL:supL + 1]
        obj.put_object('prac1', 'B' + str(j) + '.mtx', pickle.dumps(submatrix2))
def reduce_count_words(args):
    file_name = args.get('file_name')
    num_partitions = args.get('num_partitions')
    cos = COSBackend(args.get('cos_params'))
    bucket_name = args.get('bucket_name')
    total_words = 0
    # add up every partial count and delete the partial files
    for i in range(num_partitions):
        file = "cw_" + file_name + str(i)
        total_words += int(cos.get_object(bucket_name, file))
        cos.delete_object(bucket_name, file)
    cos.put_object(bucket_name, "final_" + file_name, str(total_words))
    return {'finish': "OK"}
def map_word_count(file, args):
    cos_params = args.get('cos_params')
    num_partition = args.get('num_partition')
    bucket_name = args.get('bucket_name')
    file_name = args.get('file_name')
    cos = COSBackend(cos_params)
    # count the occurrences of each word in this partition
    split_file = file.split()
    new_dict = {}
    for word in split_file:
        paraula = str(word)
        if paraula not in new_dict:
            new_dict[paraula] = 1
        else:
            new_dict[paraula] += 1
    cos.put_object(bucket_name, str("wc_" + file_name + str(num_partition)),
                   json.dumps(new_dict))
    return {'finish': "OK"}
def incializar(n, m, l, rang, work):
    cos = COSBackend()
    iterdata = []
    num = work
    # if the value is not valid (greater than the valid ones and not equal to their product)
    if (num > n or num > l) and num != n * l:
        num = n
        num2 = l
    else:
        num2 = num
    matrizA = [[(np.random.randint(rang)) for i in range(m)] for j in range(n)]
    matrizB = [[(np.random.randint(rang)) for i in range(l)] for j in range(m)]
    array = np.array_split(matrizA, num)
    array2 = np.array_split(np.transpose(matrizB), num2)
    for i in range(num):
        name = "fil" + str(i)
        cos.put_object('____', name, pickle.dumps(array[i], pickle.HIGHEST_PROTOCOL))
    for j in range(num2):
        name = "col" + str(j)
        cos.put_object('____', name, pickle.dumps(np.transpose(array2[j]), pickle.HIGHEST_PROTOCOL))
    if work == (l * n):
        for i in range(num):
            for j in range(num2):
                array = []
                array.append("fil" + str(i))
                array.append("col" + str(j))
                iterdata.append([array])
    else:
        for i in range(num):
            array = []
            for j in range(num2):
                array.append("fil" + str(i))
                array.append("col" + str(j))
            iterdata.append([array])
    return iterdata
def matrix_ini(x, n, m, l, iterdata):
    cos = COSBackend()
    np.random.seed()
    A = np.random.randint(2 * x, size=(m, n)) - x
    B = np.random.randint(2 * x, size=(n, l)) - x
    # Sequential upload of the data
    if WORKERS == 1:
        cos.put_object(BUCKET, '/secuencial/A', p.dumps(A, p.HIGHEST_PROTOCOL))
        cos.put_object(BUCKET, '/secuencial/B', p.dumps(B, p.HIGHEST_PROTOCOL))
    # Parallel upload of the data
    else:
        # Split matrix A into packages according to the number of workers
        for i in iterdata:
            i = str(i).split('|')
            # Get the worker's start position
            op_ini = i[1].split(',')
            op_ini[0] = int(op_ini[0])
            # Get the worker's end position
            op_fi = i[2].split(',')
            op_fi[0] = int(op_fi[0]) + 1
            cos.put_object(BUCKET, '/paralelo/f' + i[0],
                           p.dumps(A[op_ini[0]:op_fi[0], :], p.HIGHEST_PROTOCOL))
        # Upload the whole matrix B
        cos.put_object(BUCKET, '/secuencial/B', p.dumps(B, p.HIGHEST_PROTOCOL))
def funcio_map(k):
    cos = COSBackend()
    # k holds a list of row/column index pairs followed by the worker id
    k = k.split(" ")
    cont = 0
    dada = ''
    for a in range(len(k) // 2):
        i = k[cont]
        j = k[cont + 1]
        cont += 2
        fil = 'fila' + str(int(i) + 1) + '.txt'
        col = 'col' + str(int(j) + 1) + '.txt'
        fila = cos.get_object('sd-ori-un-buen-cubo', fil)
        columna = cos.get_object('sd-ori-un-buen-cubo', col)
        fila = fila.decode()
        columna = columna.decode()
        fila = fila.split(",")
        columna = columna.split(",")
        # dot product of the row and the column
        acum = 0
        for b in range(len(fila)):
            acum += int(fila[b]) * int(columna[b])
        dada += str(i) + " " + str(j) + ' ' + str(acum) + ' '
    dada = dada[:-1]
    dada = dada.encode()
    cos.put_object('sd-ori-un-buen-cubo', 'worker' + k[len(k) - 1] + '.txt', dada)
    return (k[len(k) - 1])
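# Not part of the original code: a hedged usage note for funcio_map. The indices below are
# hypothetical and assume the 'filaN.txt' / 'colN.txt' objects were already uploaded by the
# helpers above. For example, funcio_map("0 0 0 1 3") multiplies row 0 by columns 0 and 1
# and stores the two dot products in 'worker3.txt'.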
def matrix_ini_paquetes(x, n, m, l, w, iterdata):
    cos = COSBackend()
    np.random.seed()
    A = np.random.randint(2 * x, size=(m, n)) - x
    B = np.random.randint(2 * x, size=(n, l)) - x
    # Sequential upload of the data
    if WORKERS == 1:
        cos.put_object(BUCKET, '/secuencial/A', p.dumps(A, p.HIGHEST_PROTOCOL))
        cos.put_object(BUCKET, '/secuencial/B', p.dumps(B, p.HIGHEST_PROTOCOL))
    # Parallel upload of the data
    else:
        # Split the matrices into rows and columns
        for i in range(0, w):
            rang = str(iterdata[i]).split('|')
            op_ini = rang[1].split(',')
            op_ini[0] = int(op_ini[0])
            op_ini[1] = int(op_ini[1])
            op_fi = rang[2].split(',')
            op_fi[0] = int(op_fi[0])
            op_fi[1] = int(op_fi[1])
            Apar = np.zeros((op_fi[0] - op_ini[0] + 1, n), int)
            if (op_fi[0] - op_ini[0]) >= 1:  # If the operation needs two or more rows
                if op_ini[1] < op_fi[1] or (op_fi[0] - op_ini[0]) > 1 or (
                        op_ini[1] == op_fi[1] and op_fi[0] - op_ini[0] == 1):
                    # If every column is affected
                    cos.put_object(BUCKET, '/paralelo/B' + str(i),
                                   p.dumps(B, p.HIGHEST_PROTOCOL))
                else:
                    # If not every column is affected despite needing two or more rows
                    Bpar = np.zeros((n, ((op_fi[1] - op_ini[1]) % l) + 1), int)
                    j = 0
                    while op_ini[1] != op_fi[1]:  # Upload only the columns that are needed
                        Bpar[:, j] = B[:, op_ini[1]]
                        op_ini[1] = (op_ini[1] + 1) % l
                        j = j + 1
                    Bpar[:, j] = B[:, op_ini[1]]
                    cos.put_object(BUCKET, '/paralelo/B' + str(i),
                                   p.dumps(Bpar, p.HIGHEST_PROTOCOL))
            else:  # Upload only the columns that are needed
                Bpar = np.zeros((n, ((op_fi[1] - op_ini[1]) % l) + 1), int)
                j = 0
                while op_ini[1] != op_fi[1]:
                    Bpar[:, j] = B[:, op_ini[1]]
                    op_ini[1] = (op_ini[1] + 1) % l
                    j = j + 1
                Bpar[:, j] = B[:, op_ini[1]]
                cos.put_object(BUCKET, '/paralelo/B' + str(i),
                               p.dumps(Bpar, p.HIGHEST_PROTOCOL))
            j = 0
            while op_ini[0] <= op_fi[0]:  # Upload the rows needed for the operation
                Apar[j, :] = A[op_ini[0], :]
                op_ini[0] = op_ini[0] + 1
                j = j + 1
            cos.put_object(BUCKET, '/paralelo/A' + str(i),
                           p.dumps(Apar, p.HIGHEST_PROTOCOL))
def main(args):
    # get arguments
    s1 = json.dumps(args)
    args = json.loads(s1)
    res = args["res"]
    url = res["rabbitmq"]["url"]
    topRange = int(args["topRange"])
    bottomRange = int(args["bottomRange"])

    # configure COS library
    odb = COSBackend(res["ibm_cos"])
    counts = Counter()

    # pika configuration
    params = pika.URLParameters(url)
    connection = pika.BlockingConnection(params)
    channel = connection.channel()
    channel.queue_declare(queue='WordCount')

    # Calculate a range that doesn't cut any word:
    # if functionNumber == -1, this is the last function, so it has to analyse until the end
    # if functionNumber == 0, this is the first one, so it can't search before its start
    if args["functionNumber"] != "-1":
        topRange = selectRange(args["fileName"], topRange, res)
    if args["functionNumber"] != '0':
        bottomRange = selectRange(args["fileName"], bottomRange, res)

    # get the part of the file that this function needs
    fileFromServer = odb.get_object(res["ibm_cos"]["bucket"], args["fileName"],
                                    extra_get_args={
                                        "Range": "bytes={0}-{1}".format(bottomRange, topRange)
                                    }).decode('UTF-8', errors='ignore')

    # Delete unwanted characters
    stringSplitted = re.sub('[^A-Za-z \n]+', '', fileFromServer)
    # Split the string
    stringSplitted = re.split(r" |\n", stringSplitted)
    # Delete "" in array
    stringSplitted = list(filter(None, stringSplitted))

    # convert the array into a count:
    # {word1: numberWord1, word2: numberWord2 ... wordN: numberWordN}
    counts.update(word.strip('.,?!"\'').lower() for word in stringSplitted)
    # count to dict
    diccionary = dict(counts)
    # dict to json
    dumped_json_string = json.dumps(diccionary)

    # upload the result file:
    #   name -> book + function number
    #   body -> json(dict(count))
    odb.put_object(res["ibm_cos"]["bucket"], args["fileName"] + args["functionNumber"],
                   dumped_json_string)

    # send a msg to the reducer with the file name as body
    channel.basic_publish(exchange='',
                          routing_key='WordCount',
                          body=args["fileName"] + args["functionNumber"])
    # close the connection
    connection.close()
    return {}
class Orchestrator:
    def __init__(self, target_bucket, target_fname, upload=False):
        self.target_fname = target_fname
        self.target_bucket = target_bucket
        self.ini_error = False
        format_str = ("cloudfunctions:\n 'endpoint': ''\n 'namespace': ''\n 'api_key': ''\n"
                      "rabbitamqp:\n 'url': ''\n"
                      "cos:\n service_endpoint: ''\n secret_key: ''\n access_key: ''")

        try:
            # load keys securely
            with open('secret.yaml', 'r') as f:
                secret = yaml.safe_load(f)

            # initialize the remote storage wrapper and upload the target file
            self.cb = COSBackend(secret['cos']['service_endpoint'],
                                 secret['cos']['secret_key'],
                                 secret['cos']['access_key'])
            if upload:
                target_file = open(self.target_fname, "rb")
                self.cb.put_object(target_bucket, target_fname, target_file.read())
                target_file.close()

            # retrieve file length, ensure the file has been uploaded
            try:
                self.fsize = int(
                    self.cb.head_object(self.target_bucket, self.target_fname)['content-length'])
            except:
                print('File \'{}\' was not found in the bucket \'{}\'. Upload it and retry.'
                      .format(self.target_fname, self.target_bucket))
                self.ini_error = True
                return None

            # initialize the function wrapper
            config = {}
            config['endpoint'] = secret['cloudfunctions']['endpoint']
            config['namespace'] = secret['cloudfunctions']['namespace']
            config['api_key'] = secret['cloudfunctions']['api_key']
            self.cf = CloudFunctions(config)

            # initialize the queue system
            self.pika_params = pika.URLParameters(secret['rabbitamqp']['url'])
        except KeyError:
            print('Wrong yaml document format. Please use the following one:')
            print(format_str)
            self.ini_error = True
            return None
        except FileNotFoundError as e:
            print('File \'{}\' not found.'.format(e.filename))
            self.ini_error = True
            return None

        # set the common args stub
        self.comargs = {}
        self.comargs['cos'] = secret['cos']
        self.comargs['rabbitamqp_url'] = secret['rabbitamqp']['url']
        self.comargs['target_bucket'] = self.target_bucket
        self.comargs['target_fname'] = self.target_fname

        # two separate queues: the reducer waits for the mappers, and the orchestrator waits for the reducer
        self.mapper_qid = 'mapperQueue'
        self.reducer_qid = 'reducerQueue'

    def run(self, mapper, nthreads):
        # check if initialization was good
        if self.ini_error:
            return -4

        # validation of parameters
        if nthreads < 1:
            print('Minimum number of partitions or threads must be 1. \nExiting...')
            return -1
        if mapper != 'CountingWords' and mapper != 'WordCount':
            print('{} is not supported as a mapper yet. Supported mappers: CountingWords, WordCount. \nExiting...'
                  .format(mapper))
            return -2

        # prepare arguments for the mapper (mapper args)
        chunk_size = int(self.fsize / nthreads)
        mapargs = self.comargs.copy()
        mapargs['qid'] = self.mapper_qid

        # start a connection with the queue system
        connection = pika.BlockingConnection(self.pika_params)
        channel = connection.channel()
        channel.queue_declare(queue=self.mapper_qid)
        channel.queue_purge(queue=self.mapper_qid)  # ensure no message was left

        # measure time
        start_t = time.time()

        # dispatch all mappers except the last one
        for i in range(0, nthreads - 1):
            mapargs['index'] = str(i)
            mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * i, chunk_size * (i + 1))
            self.cf.invoke(mapper, mapargs)
            #print('[{}]'.format(mapargs['index']), chunk_size*i, 'to', chunk_size*(i+1))

        # dispatch the last mapper, so that it takes the rest of the file
        mapargs['index'] = nthreads - 1
        mapargs['Range'] = 'bytes={}-{}'.format(chunk_size * (nthreads - 1), self.fsize)
        self.cf.invoke(mapper, mapargs)
        #print('[{}]'.format(mapargs['index']), chunk_size*(nthreads-1), 'to', self.fsize)

        # prepare arguments for the reducer (reducer args)
        redargs = self.comargs.copy()
        redargs['reduce_{}'.format(mapper)] = 'yes'
        redargs['nthreads'] = nthreads
        redargs['mapper_qid'] = self.mapper_qid
        redargs['reducer_qid'] = self.reducer_qid

        channel.queue_declare(queue=self.reducer_qid)
        channel.queue_purge(queue=self.reducer_qid)  # ensure no message was left
        self.cf.invoke('Reducer', redargs)

        # wait for the reducer to finish
        channel.basic_consume(queue=self.reducer_qid, on_message_callback=SingleCallback())
        channel.start_consuming()

        # measure time
        end_t = time.time()
        connection.close()
        print('Done.\nExecution time: {0:.5g}s'.format(end_t - start_t))

    def claimFile(self, result_type, result_fname):
        # check if initialization was good
        if self.ini_error:
            return -4

        try:
            result_file = open(result_fname, "w")
            cos_result = self.cb.get_object(
                self.target_bucket,
                '{}/{}-result'.format(self.target_fname, result_type))
            result_file.write(cos_result.decode('utf-8'))
            result_file.close()
        except:
            print('Something went wrong, could not download the result file for: {}, action: {}'
                  .format(self.target_fname, result_type))
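# Not part of the original code: a minimal usage sketch for Orchestrator, assuming a
# secret.yaml with the format printed above. The bucket and file names below are
# hypothetical placeholders.
if __name__ == '__main__':
    orchestrator = Orchestrator('some-bucket', 'book.txt', upload=True)
    if orchestrator.run('WordCount', nthreads=4) is None:  # run() returns None on success
        orchestrator.claimFile('WC', 'WC-result.json')     # download '<fname>/WC-result' locally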