def hySpeechRecognition(self, filename, content):
    """Run the iFly API and iFly SDK recognizers in parallel and return
    whichever result arrives first.

    Both workers are handed the same semaphore and queue; each is expected
    to put its result on ``resultQueue`` and release ``s`` when done, so
    ``s.acquire()`` blocks only until the first worker finishes.

    Args:
        filename: audio file name passed through to both backends.
        content: audio payload passed through to both backends.

    Returns:
        The first recognition result produced by either backend.

    Fix vs. original: the old ``try: return result / except`` block was
    unreachable dead code (returning a local never raises), so the apparent
    "fall back to the second result" path could never run; it has been
    removed without changing behavior. ``setDaemon()`` (deprecated) was
    replaced by the ``daemon`` attribute.
    """
    resultQueue = Queue.Queue()
    s = Semaphore(0)
    t_api = thread(target=self.request_ifly_api,
                   args=(filename, content, s, resultQueue))
    t_sdk = thread(target=self.request_ifly_sdk,
                   args=(filename, content, s, resultQueue))
    # Daemon threads: a hung backend must not keep the process alive.
    t_api.daemon = True
    t_sdk.daemon = True
    t_sdk.start()
    t_api.start()
    # Block until the first worker signals completion, then take its result.
    s.acquire()
    return resultQueue.get()
class Pool(object):
    """Bounded producer/consumer parallelizer.

    A semaphore sized by ``config.settings.QUEUE_SIZE`` throttles the
    producer so at most that many items are in flight at once, which keeps
    RAM usage bounded while a ThreadPool consumes items concurrently.

    Fix vs. original: the two bare ``except:`` clauses were narrowed to
    ``except Exception:`` so ``SystemExit``/``KeyboardInterrupt`` (and other
    ``BaseException``s) still propagate instead of being swallowed.
    """

    def __init__(self):
        # Counts free "slots": the producer acquires one per yielded item,
        # the consumer releases one as soon as it picks an item up.
        self.semaphore = Semaphore(config.settings.QUEUE_SIZE)

    def queue_producer(self, producer):
        """Yields items as soon as the semaphore allows."""
        try:
            for item in producer:
                self.semaphore.acquire()
                yield item
        except Exception:
            # Any producer failure is logged and simply ends the feed;
            # consumers drain whatever was already yielded.
            logger.exception("Error in producer parallel task")

    def queue_consumer(self, consumer):
        """Returns item consumption function that signals the semaphore."""
        def consumer_function(item):
            # Free a slot first so the producer can refill while we work.
            self.semaphore.release()
            try:
                consumer(item)
            except Exception:
                # One failed item must not kill the worker thread.
                logger.exception("Error in consumer parallel task")
        return consumer_function

    def parallelize(self, consumer, producer):
        """Implements a queued production of items to paralelize, limits RAM usage.

        imap() uses correctly the generator, is more memory efficient;
        imap_unordered() does not wait on each item to be processed.
        Consumer return values are discarded.

        Args:
            consumer (function): Ingest and process items
            producer (generator): Yields items to be consumed
        """
        logger.info("Starting paralelization")
        self.pool = ThreadPool(config.settings.NUM_CONCURRENT_WORKERS)
        self.pool.imap_unordered(self.queue_consumer(consumer),
                                 self.queue_producer(producer))
        self.pool.close()
        self.pool.join()
        logger.info("Finishing paralelization")
def train(layer, logger, args, grad_queue, grad_queue2, targets_queue, e, data_size, trainloader, start_event, start_event2):
    """Pipeline-parallel training loop for a model split across 3 ranks.

    Behaviour depends on ``dist.get_rank()``:
      rank 0: feeds batches through this partition, ships int8-quantized
              activations to rank 1, and runs a background process
              (``backward_rank0``) that receives gradients back from rank 1
              and steps the optimizer.
      rank 1: middle partition — receives activations from rank 0, forwards
              them, sends activations on to rank 2; ``backward_rank1``
              returns quantized input-gradients to rank 0.
      rank 2: last partition — computes loss/accuracy and sends quantized
              gradients of its inputs back to rank 1.

    Args:
        layer: this rank's model partition (an nn.Module).
        logger: used by rank 2 to log loss/accuracy lines (at ERROR level).
        args: expects ``.batch_size`` and ``.buffer_size`` attributes.
        grad_queue, grad_queue2: appear unused in this function — TODO confirm dead.
        targets_queue: carries labels (as numpy arrays) from rank 0 to rank 2.
        e: shutdown event — set by rank 0, waited on by ranks 1 and 2.
        data_size: batch count for rank 2's progress bar.
        trainloader: iterable of (inputs, targets) batches; rank 0 only.
        start_event, start_event2: startup handshakes for the backward processes.

    NOTE(review): the inter-rank tensors are hard-coded as
    [batch, 256, 4, 4] (ranks 0<->1) and [batch, 512, 2, 2] (ranks 1<->2) —
    confirm these match the actual model split.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(layer.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    optimizer.zero_grad()
    layer.train()

    def backward_rank0(semaphore, start_event2):
        # Background backward pass for rank 0: receive the gradient of the
        # activations previously sent to rank 1 and backprop through this
        # partition. A failed recv (RuntimeError) acts as the shutdown signal.
        start_event2.wait()
        batch_idx = 0
        while True:
            try:
                # Free one forward slot before blocking on the gradient.
                semaphore.release()
                print("before grad recv")
                grad_recv = torch.zeros([args.batch_size, 256, 4, 4], dtype=torch.int8)
                dist.recv(tensor=grad_recv, src=1)
                print("after grad recv...")
            except RuntimeError as error:
                print("backward runtime error")
                break
            grad_recv = dequantize(grad_recv.cuda(0).float())
            # NOTE(review): non-blocking get assumes the forward side has
            # always queued an output by now — raises queue.Empty otherwise;
            # confirm forward always runs ahead of the gradient stream.
            loss = outputs_queue.get(block=False)
            loss.backward(grad_recv)
            # Gradients are accumulated for buffer_size batches per step.
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()
            batch_idx += 1

    def backward_rank1(semaphore, start_event, start_event2):
        # Background backward pass for rank 1: receive gradients from rank 2,
        # backprop through this partition, and forward quantized
        # input-gradients to rank 0.
        start_event.wait()
        batch_idx = 0
        while True:
            try:
                #semaphore.release()
                print("before grad recv...")
                grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
                dist.recv(tensor=grad_recv1, src=2)
                print("after grad recv.....")
            except RuntimeError as error:
                # Shutdown path: notify rank 0 with a zero-size tensor, then exit.
                print("backward runtime error")
                send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
                send_opt.wait()
                break
            grad_recv1 = dequantize(grad_recv1.cuda(0).float())
            inputs, outputs = outputs_queue.get(block=False)
            # Redundant: the forward side already called requires_grad_() on
            # this tensor before running layer(); harmless here.
            inputs.requires_grad_()
            outputs.backward(grad_recv1)
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                optimizer.zero_grad()
            inputs_grad = quantize(inputs.grad, char=True).cpu()
            print(inputs_grad.size())
            if batch_idx == 0:
                # First gradient processed: unblock rank 0's backward process.
                start_event2.set()
            #send_opt = dist.isend(tensor=inputs_grad, dst=0)
            #send_opt.wait()
            dist.send(tensor=inputs_grad, dst=0)
            batch_idx += 1

    if dist.get_rank() == 0:
        criterion.cuda(0)
        # NOTE(review): backward_rank0 runs in a separate Process yet shares
        # outputs_queue/optimizer only through closure; unless Process is
        # thread-backed or uses shared memory, items put after the fork may
        # not be visible to the child — confirm intended.
        outputs_queue = ThreadQueue(args.buffer_size)
        semaphore = Semaphore(args.buffer_size)
        back_process = Process(target=backward_rank0, args=(semaphore, start_event2))
        back_process.start()
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            # One permit per in-flight batch; released by backward_rank0.
            semaphore.acquire()
            print("batch: " + str(batch_idx))
            inputs, targets = inputs.cuda(0), targets
            outputs = layer(inputs)
            targets_queue.put(targets.numpy())
            outputs_queue.put(outputs)
            # Ship int8-quantized activations to rank 1.
            send_opt = dist.isend(tensor=q_act(outputs, char=True).cpu(), dst=1)
            send_opt.wait()
            print("send....")
        print("start to end..")
        # Zero-size tensor = end-of-stream marker for rank 1.
        send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
        send_opt.wait()
        back_process.join()
        e.set()
    elif dist.get_rank() == 1:
        batch_idx = 0
        criterion.cuda(0)
        outputs_queue = ThreadQueue(10)
        semaphore = Semaphore(args.buffer_size - 1)
        back_process = Process(target=backward_rank1, args=(semaphore, start_event, start_event2))
        back_process.start()
        while True:
            try:
                print("before semaphore......")
                #semaphore.acquire()
                rec_val = torch.zeros([args.batch_size, 256, 4, 4], dtype=torch.int8)
                dist.recv(tensor=rec_val, src=0)
                print("after recv.....")
            except RuntimeError as error:
                # End-of-stream from rank 0: propagate marker to rank 2,
                # drain the backward process, wait for global shutdown.
                print("runtime errror")
                send_opt = dist.isend(tensor=torch.zeros(0), dst=2)
                send_opt.wait()
                back_process.join()
                e.wait()
                break
            print("before dq...")
            rec_val = dq_act(rec_val)
            rec_val = rec_val.cuda(0)
            # Track gradients so backward_rank1 can read rec_val.grad later.
            rec_val.requires_grad_()
            print("before output......")
            outputs = layer(rec_val)
            # if batch_idx % args.buffer_size == 0:
            #     optimizer.step()
            #     optimizer.zero_grad()
            print("before queue")
            # Keep (input, output) pair for the matching backward step.
            outputs_queue.put([rec_val, outputs])
            print("after queue")
            #send_opt = dist.isend(tensor=q_act(outputs, char=True).cpu(), dst=2)
            #send_opt.wait()
            dist.send(tensor=q_act(outputs, char=True).cpu(), dst=2)
            batch_idx += 1
            print("send end...")
    elif dist.get_rank() == 2:
        batch_idx = 0
        train_loss = 0
        correct = 0
        total = 0
        criterion.cuda(0)
        while True:
            try:
                #print("before recv....")
                rec_val = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
                dist.recv(tensor=rec_val, src=1)
                #print("after recv.....")
            except RuntimeError as error:
                #traceback.format_exc(error)
                # End-of-stream: echo the marker to rank 1 and wait for shutdown.
                send_opt = dist.isend(tensor=torch.zeros(0), dst=1)
                send_opt.wait()
                e.wait()
                break
            rec_val = dq_act(rec_val)
            rec_val = rec_val.cuda(0)
            # Needed so loss.backward() populates rec_val.grad for the
            # gradient we send back to rank 1.
            rec_val.requires_grad_()
            outputs = layer(rec_val)
            targets = targets_queue.get(block=True, timeout=2)
            targets = torch.from_numpy(targets).cuda(0)
            loss = criterion(outputs, targets)
            loss.backward()
            # NOTE(review): loss/accuracy stats are only accumulated on the
            # stepping batches (batch_idx % buffer_size == 0) — confirm this
            # sampling is intended rather than per-batch accounting.
            if batch_idx % args.buffer_size == 0:
                optimizer.step()
                train_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(batch_idx, data_size, 'Loss: %.3f | Acc: %.3f%% (%d/%d)' % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
                optimizer.zero_grad()
            else:
                progress_bar(batch_idx, data_size, 'Loss: %.3f | Acc: %.3f%% (%d/%d)' % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total))
                #if batch_idx % 10 == 0:
                # NOTE(review): the throttle above is commented out, so these
                # lines log on every non-step batch, at ERROR level.
                logger.error("train:" + str(train_loss / (batch_idx + 1)))
                acc_str = "tacc: %.3f" % (100. * correct / total,)
                logger.error(acc_str)
            if batch_idx == 0:
                # First batch done: unblock rank 1's backward process.
                start_event.set()
            quantize_grad = quantize(rec_val.grad, char=True)
            #send_opt = dist.isend(tensor=quantize_grad.cpu(), dst=1)
            #send_opt.wait()
            dist.send(tensor=quantize_grad.cpu(), dst=1)
            batch_idx += 1