def inference(file_path, file_urls):
    result = dict()
    with open("./assets/config.yaml", 'r') as f:  # close the file after reading
        config = yaml.load(f, Loader=yaml.FullLoader)
    pse = InferencePSE(config["pse_evaluation_parameter"], file_path)
    ocr = InferenceOCR(config["ocr_evaluation_parameter"], file_path)
    # Inference PSE
    pse_time = pse.run()
    # Crop image based on PSE output
    # Release the GPU memory
    cuda.select_device(int(config["pse_evaluation_parameter"]["gpu_list"]))
    cuda.close()
    print(file_path)
    CropPSE(file_path)
    # Inference OCR
    ocr_time = ocr.run()
    # Combining results
    # CreateTxt(file_urls)
    for file_name in file_urls:
        result[file_name] = dict()
        txt_file = "./assets/demo/text/" + file_name.replace("jpg", "txt")
        img_file = file_path + file_name
        df, _, _, _ = create_df(txt_file)
        dict_cells, list_infos = create_cells(df)
        result[file_name]['df'] = create_DB(dict_cells, list_infos).drop(
            'idx', axis=1).to_html(header="true")
        # Visualizer
        result[file_name]['img'] = connect_and_save(img_file, dict_cells, list_infos)
    return result

def worker(input_q, output_q):
    # Load a (frozen) TensorFlow model into memory.
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
        sess = tf.Session(graph=detection_graph)
    mtcnn = detect_and_align.create_mtcnn(sess, None)

    fps = FPS().start()
    while True:
        fps.update()
        frame = input_q.get()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        face_patches, padded_bounding_boxes, landmarks = detect_and_align.detect_faces(frame_rgb, mtcnn)
        output = dict(face_boxes=padded_bounding_boxes)
        output_q.put(output)

    # Note: unreachable unless a break is added to the loop above.
    fps.stop()
    sess.close()
    cuda.select_device(0)
    cuda.close()

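# A minimal driver sketch for the worker above (an assumption, not part of the
# source): run it in its own process so the cuda.close() at the end tears down
# that process's private CUDA context. Queue sizes and the frame source are
# illustrative.
from multiprocessing import Process, Queue

if __name__ == '__main__':
    input_q = Queue(maxsize=5)
    output_q = Queue(maxsize=5)
    Process(target=worker, args=(input_q, output_q), daemon=True).start()
    # feed frames (e.g. from cv2.VideoCapture) and read detections back:
    # input_q.put(frame); boxes = output_q.get()['face_boxes']
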
def gpu_dmt(cand, device=0):
    """
    :param cand: Candidate object
    :param device: GPU id
    :return:
    """
    cuda.select_device(device)
    chan_freqs = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    dm_list = cuda.to_device(np.linspace(0, 2 * cand.dm, 256, dtype=np.float32))
    dmt_return = cuda.to_device(np.zeros((256, cand.data.shape[0]), dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(cand.data.T, dtype=np.uint8))

    @cuda.jit
    def gpu_dmt(cand_data_in, chan_freqs, dms, cand_data_out, tsamp):
        ii, jj, kk = cuda.grid(3)
        if ii < cand_data_in.shape[0] and jj < cand_data_in.shape[1] and kk < dms.shape[0]:
            disp_time = int(
                -1 * 4148808.0 * dms[kk] * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2) / 1000 / tsamp)
            cuda.atomic.add(cand_data_out, (kk, jj), cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]])

    threadsperblock = (16, 8, 8)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid_z = math.ceil(dm_list.shape[0] / threadsperblock[2])
    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
    gpu_dmt[blockspergrid, threadsperblock](cand_data_in, chan_freqs, dm_list, dmt_return, float(cand.tsamp))
    cand.dmt = dmt_return.copy_to_host()
    cuda.close()
    return cand

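# A rough usage sketch for gpu_dmt above, not from the original code base:
# Candidate is stood in by a SimpleNamespace carrying only the attributes the
# function touches (data, dm, chan_freqs, tsamp); shapes and values are made up.
import types
import numpy as np

cand = types.SimpleNamespace(
    data=np.random.randint(0, 256, (4096, 336), dtype=np.uint8),  # (time, channels)
    dm=56.7,                                                      # pc cm^-3
    chan_freqs=np.linspace(1465.0, 1129.0, 336),                  # MHz, descending
    tsamp=0.000256,                                               # seconds
)
cand = gpu_dmt(cand)  # fills cand.dmt with the (256 DM trials, n_samples) plane
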
def release_GPU_memory():  # renamed from the original's misspelled `relase_GPU_memory`
    K.clear_session()
    cuda.select_device(0)
    cuda.close()
    config = tf.ConfigProto()
    K.tensorflow_backend.set_session(tf.Session(config=config))

def cuda_select_device(dev_i):
    try:
        cuda.close()
    except Exception as e:
        print(e)
    cuda.select_device(dev_i)

def clear(self):
    K.clear_session()
    gc.collect()
    del self.model
    for gpu in range(len(cuda.gpus)):
        cuda.select_device(gpu)
        cuda.close()

def gpu_dmt_timeseries(dedisp_times, psr_data, max_delay, device=0):
    """
    :param dedisp_times: per-channel, per-DM-trial dispersion delays in samples, shape (n_channels, n_dms)
    :param psr_data: input time-frequency data, shape (n_samples, n_channels)
    :param max_delay: maximum dispersion delay in samples (trims the output time axis)
    :param device: GPU id
    :return: DM-time array
    """
    cuda.select_device(device)
    dm_time = np.zeros((dedisp_times.shape[1], int(psr_data.shape[0] - max_delay)), dtype=np.float32)

    @cuda.jit(fastmath=True)
    def gpu_dmt(cand_data_in, all_delays, cand_data_out):
        ii, jj, kk = cuda.grid(3)
        if ii < cand_data_in.shape[0] and jj < cand_data_out.shape[1] and kk < all_delays.shape[1]:
            cuda.atomic.add(cand_data_out, (kk, jj), cand_data_in[ii, (jj + all_delays[ii, kk])])

    all_delays = cuda.to_device(dedisp_times)
    dmt_return = cuda.device_array(dm_time.shape, dtype=np.float32)
    cand_data_in = cuda.to_device(np.array(psr_data.T, dtype=psr_data.dtype))

    threadsperblock = (4, 8, 32)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid_z = math.ceil(dedisp_times.shape[1] / threadsperblock[2])
    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
    gpu_dmt[blockspergrid, threadsperblock](cand_data_in, all_delays, dmt_return)
    dm_time = dmt_return.copy_to_host()
    cuda.close()
    return dm_time

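# A hedged sketch of building the dedisp_times table the function above expects:
# integer sample delays of shape (n_channels, n_dm_trials), using the same
# dispersion constant and first-channel reference as the other kernels in this
# collection. make_delay_table is a hypothetical helper, not from the source;
# note the kernel has no modulo wrap, hence the max_delay trim of the output.
import numpy as np

def make_delay_table(chan_freqs, dms, tsamp):
    # delay (in samples) of each channel at each trial DM, relative to channel 0
    rel = -(1 / chan_freqs[0] ** 2 - 1 / chan_freqs[:, None] ** 2) * 4148808.0 / 1000 / tsamp
    return (rel * dms[None, :]).astype(np.int32)

# delays = make_delay_table(freqs, np.linspace(0, 2 * dm, 256), tsamp)
# dmt = gpu_dmt_timeseries(delays, psr_data, max_delay=delays.max())
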
def gpu_dedisperse(cand, device=0):
    """
    :param cand: Candidate object
    :param device: GPU id
    :return:
    """
    cuda.select_device(device)
    chan_freqs = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(cand.data.T, dtype=np.uint8))
    cand_data_out = cuda.to_device(np.zeros_like(cand.data.T, dtype=np.uint8))

    @cuda.jit
    def gpu_dedisp(cand_data_in, chan_freqs, dm, cand_data_out, tsamp):
        ii, jj = cuda.grid(2)
        if ii < cand_data_in.shape[0] and jj < cand_data_in.shape[1]:
            disp_time = int(-4148808.0 * dm * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2) / 1000 / tsamp)
            cand_data_out[ii, jj] = cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]]

    threadsperblock = (32, 32)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    gpu_dedisp[blockspergrid, threadsperblock](cand_data_in, chan_freqs, float(cand.dm), cand_data_out, float(cand.tsamp))
    cand.dedispersed = cand_data_out.copy_to_host().T
    cuda.close()
    return cand

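# Usage sketch (assumed) for gpu_dedisperse above, with the same stand-in
# Candidate as earlier; the kernel wraps shifted samples via the modulo, so the
# output keeps cand.data's exact (time, channels) shape.
# cand = gpu_dedisperse(cand, device=0)
# assert cand.dedispersed.shape == cand.data.shape
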
def main():
    data = pd.read_csv(
        'C:/Users/kdan/BigJob12/main_project/_db/data/model_data/working/to_reid.csv')
    image_path = 'C:/Users/kdan/BigJob12/main_project/_db/data/Preprocessed_data/'
    copy_path = 'C:/Users/kdan/BigJob12/main_project/_db/data/model_data/gallery/gallery_list/'

    shutil.rmtree(copy_path, ignore_errors=True)  # tolerate a missing directory
    if not os.path.isdir(copy_path[:-1]):
        os.mkdir(copy_path[:-1])

    start = time.time()  # record the start time
    for image_file_path in data['file_name']:
        try:
            shutil.copy(image_path + image_file_path,
                        copy_path + image_file_path.split('/')[-1])
        except OSError:  # skip files that cannot be copied
            pass

    print(len(data['file_name']))
    print("time :", time.time() - start)

    gc.collect()
    sys.stdout.flush()
    cuda.close()
    torch.cuda.empty_cache()  # release cached GPU memory held by PyTorch

def CREATE_MODEL(self):
    try:
        cuda.select_device(0)
        cuda.close()
    except Exception:
        pass
    try:
        tf.keras.backend.clear_session()
    except Exception:
        pass
    self.outsize = self.dict['OTHERS']['1']['OUT_SIZE']
    self.windowlength = self.dict['OTHERS']['1']['WINDOW_LEN']
    self.MAX_window = self.dict['OTHERS']['1']['WINDOW_LEN']
    self.batch = self.dict['OTHERS']['1']['BATCH_SIZE']
    self.period = self.dict['OTHERS']['1']['PERIOD']
    self.optimizer.learning_rate = self.dict['OTHERS']['1']['LR']
    self.epochz = self.dict['OTHERS']['1']['EPOCHS']
    if self.FIRST_ITER:
        self.CREATE_DATA()
        self.FIRST_ITER = False
    # Rebuild the data unless only LR and/or EPOCHS changed. `not` replaces the
    # original's bitwise `~`, which is always truthy on a bool; the original's
    # lone 'EPOCHs' entry looked like a typo for 'EPOCHS' and is normalized here.
    changed_keys = list(self.VARS_EX['OTHERS'].keys())
    if not (len(changed_keys) == 0
            or changed_keys == ['LR']
            or changed_keys == ['LR', 'EPOCHS']
            or changed_keys == ['EPOCHS']):
        self.CREATE_DATA()
    self.model_parallel()
    self.trainingz()
    self.SAVE_PLOTS()
    print(self.epochz)

def stupidconv_gpu(img, filt, padval):
    """
    Does convolution without using FFT, because FFT is pissing me off and giving me weird answers.

    :param img: 2D input image
    :param filt: 2D filter kernel
    :param padval: value used to pad the image borders
    :return: filtered image, normalized by the number of nonzero filter entries
    """
    cuda.close()
    cuda.select_device(1)

    # get the number of nonzero entries in the filter for later averaging of the result
    filt_nnz = np.count_nonzero(filt)

    # pad the image; appropriate padding depends on context,
    # here we pad with the filter size all around the image
    s_filt = filt.shape
    s_img = img.shape
    pad_img = np.ones((s_img[0] + (2 * s_filt[0]), s_img[1] + (2 * s_filt[1])), dtype=np.float32) * padval
    pad_img[s_filt[0]: s_img[0] + s_filt[0], s_filt[1]: s_img[1] + s_filt[1]] = img

    output = np.zeros(pad_img.shape, dtype=np.float32)

    d_pad_img = cuda.to_device(pad_img)
    d_filt = cuda.to_device(filt)
    d_output = cuda.to_device(output)

    stupidconv_gpu_helper(d_pad_img, d_filt, s_img[0], s_img[1], s_filt[0], s_filt[1], d_output)

    output = d_output.copy_to_host()
    output = output[s_filt[0]:s_filt[0] + s_img[0], s_filt[1]:s_filt[1] + s_img[1]]
    return output / filt_nnz

def detect(self, image):
    cuda.select_device(0)
    # TF session config renamed from the original's `config`, which was later
    # shadowed by the Mask R-CNN InferenceConfig below
    tf_config = ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = InteractiveSession(config=tf_config)
    ROOT_DIR = "/home/bernihoh/Bachelor/SMS/MaskRCNN/samples/SMSNetworks/face_feature_detection/"
    MODEL_DIR = os.path.join(ROOT_DIR, "logsFaceFeatureDetection")
    COCO_MODEL_PATH = "/home/bernihoh/Bachelor/SMS/MaskRCNN/samples/SMSNetworks/face_feature_detection/mask_rcnn_face_feature_detection_0029.h5"
    config = InferenceConfig()
    config.display()
    # Create model object in inference mode.
    model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)
    # Load weights trained on MS-COCO
    model.load_weights(COCO_MODEL_PATH, by_name=True)
    class_names = ["bg", "iris_l", "inner_eye_l", "outer_eye_l", "eye_brow_l", "cheek_l",
                   "iris_r", "inner_eye_r", "outer_eye_r", "eye_brow_r", "cheek_r",
                   "nose_tip", "nose", "mouth", "chin", "face", "head", "distortion"]
    results = model.detect([image], verbose=1)
    r = results[0]
    session.close()
    cuda.close()
    return r

def get_prediction_real_time(sparkEngine, model=None, url_weight="", dim=15, prediction_weight="",
                             encoder_length=24, decoder_length=24, attention_length=24, is_close_cuda=True):
    # continuously crawl aws and aqi & weather
    end = utils.get_datetime_now()
    end = end - timedelta(hours=1)
    start = end - timedelta(hours=23)
    start = start.replace(minute=0, second=0, microsecond=0)
    # 2. process normalized data vectors
    vectors, w_pred, china_vectors, timestamp = sparkEngine.process_vectors(start, end, dim)
    v_l = len(vectors)
    if v_l:
        sp_vectors = psv.convert_data_to_grid_exe(vectors)
        if v_l < encoder_length:
            sp_vectors = np.pad(sp_vectors, ((encoder_length - v_l, 0), (0, 0), (0, 0), (0, 0)),
                                'constant', constant_values=0)
        if w_pred:
            # repeat for 25 districts
            w_pred = np.repeat(np.expand_dims(w_pred, 1), p.grid_size, 1)
            de_vectors = psv.convert_data_to_grid_exe(w_pred)
            # pad to fill top elements of decoder vectors
            de_vectors = np.pad(de_vectors, ((0, 0), (0, 0), (0, 0), (6, 0)),
                                'constant', constant_values=0)
        else:
            # know nothing about the future weather forecast
            de_vectors = np.zeros((decoder_length, p.grid_size, p.grid_size, dim))
        sp_vectors = np.concatenate((sp_vectors, de_vectors), axis=0)
        c_l = len(china_vectors)
        if c_l < attention_length:
            china_vectors = np.pad(china_vectors, ((attention_length - c_l, 0), (0, 0)),
                                   'constant', constant_values=0)
        # 4. feed to model (alternatives in the original: BaselineModel, MaskGan, APNet)
        if model is None:
            model = APGan(encoder_length=24, decoder_length=24, encode_vector_size=15, batch_size=1,
                          decode_vector_size=9, grid_size=25, forecast_factor=0)
        model.set_data(sp_vectors, [0], None, china_vectors)
        with tf.device('/%s' % p.device):
            model.init_ops(is_train=False)
            saver = tf.train.Saver()
        tconfig = get_gpu_options(False)
        with tf.Session(config=tconfig) as session:
            model.assign_datasets(session)
            preds_pm25 = realtime_execute(model, session, saver, decoder_length, p.prediction_weight_pm25)
            model.forecast_factor = 1
            preds_pm10 = realtime_execute(model, session, saver, decoder_length, p.prediction_weight_pm10)
        china_vectors = np.array(china_vectors)
        if is_close_cuda:
            cuda.select_device(0)
            cuda.close()
        return (preds_pm25, preds_pm10), timestamp, np.transpose(china_vectors[:, :2] * 500)
    else:
        return ([], []), [], []

def cleanup():
    from keras import backend as K
    K.clear_session()
    from numba import cuda
    cuda.select_device(0)
    cuda.close()

def testin():
    N = 2000
    M = 2000
    h = np.asarray(np.float32(2) + np.random.random((N, M)), dtype=np.float32)
    n = np.asarray(np.random.random((N, M)), dtype=np.float32)
    u = np.asarray(np.random.random((N + 1, M)), dtype=np.float32)
    v = np.asarray(np.random.random((N, M + 1)), dtype=np.float32)
    f = np.asarray(np.random.random((N, M)), dtype=np.float32)
    dx = np.float32(0.1)
    dy = np.float32(0.2)
    nu = np.float32(1.0)
    out_u = np.asarray(np.random.random((M, N + 1)), dtype=np.float32)

    threadsperblock = (16, 32)
    # ceil division; the original's (size + tpb) // tpb allocated one extra
    # block whenever the size was an exact multiple of the block size
    blockspergrid_x = (u.shape[0] + threadsperblock[0] - 1) // threadsperblock[0]
    blockspergrid_y = (u.shape[1] + threadsperblock[1] - 1) // threadsperblock[1]
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    print("here we go", u.shape)
    print("blocks per grid", blockspergrid)
    print("threads per block", threadsperblock)

    try:
        for cu_u_driver in (cu_u_driver_global,):
            print(cu_u_driver)
            v1 = cuda.to_device(v)
            out_u1 = cuda.to_device(out_u)
            ts = []
            for i in range(10):
                t = mytime()
                for j in range(100):
                    # ten launches per iteration (written out by hand in the original)
                    for _ in range(10):
                        cu_u_driver[blockspergrid, threadsperblock](v1, out_u1)
                cuda.synchronize()
                t2 = mytime()
                ts.append(t2 - t)  # the original stored t - t2, a negative duration
            print("cuda")
            print(np.median(ts), np.min(ts), np.max(ts), np.std(ts))
            print(ts)
    finally:
        print("cuda closer")
        cuda.close()
    print("all done")

def gpu_dmt(cand, device=0):
    """
    GPU DM-Time bow-tie (by rolling the array)

    Args:
        cand: Candidate instance
        device (int): GPU ID

    Returns:
        candidate object
    """
    cuda.select_device(device)
    chan_freqs = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    dm_list = cuda.to_device(np.linspace(0, 2 * cand.dm, 256, dtype=np.float32))
    dmt_return = cuda.to_device(np.zeros((256, cand.data.shape[0]), dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(cand.data.T, dtype=cand.data.dtype))

    @cuda.jit
    def gpu_dmt(cand_data_in, chan_freqs, dms, cand_data_out, tsamp):
        ii, jj, kk = cuda.grid(3)
        if (
            ii < cand_data_in.shape[0]
            and jj < cand_data_in.shape[1]
            and kk < dms.shape[0]
        ):
            disp_time = int(
                -1
                * 4148808.0
                * dms[kk]
                * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2)
                / 1000
                / tsamp
            )
            cuda.atomic.add(
                cand_data_out,
                (kk, jj),
                cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]],
            )

    threadsperblock = (16, 8, 8)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid_z = math.ceil(dm_list.shape[0] / threadsperblock[2])
    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)
    gpu_dmt[blockspergrid, threadsperblock](
        cand_data_in, chan_freqs, dm_list, dmt_return, float(cand.your_header.tsamp)
    )
    cand.dmt = dmt_return.copy_to_host()
    cuda.close()
    return cand

def newthread():
    cuda.select_device(0)
    stream = cuda.stream()
    A = np.arange(100)
    dA = cuda.to_device(A, stream=stream)
    stream.synchronize()
    del dA
    del stream
    cuda.close()

def newthread():
    devices = range(driver.get_device_count())
    print('Devices', devices)
    for _ in range(2):
        for d in devices:
            cuda.select_device(d)
            print('Selected device', d)
            cuda.close()
            print('Closed device', d)

def newthread(exception_queue):
    try:
        devices = range(driver.get_device_count())
        for _ in range(2):
            for d in devices:
                cuda.select_device(d)
                cuda.close()
    except Exception as e:
        exception_queue.put(e)

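# A plausible driver for newthread above (inferred from its signature, in the
# style of numba's own tests): the body must run on its own thread, because
# cuda.select_device()/cuda.close() only manage the calling thread's context.
import threading
import queue

exception_queue = queue.Queue()
t = threading.Thread(target=newthread, args=(exception_queue,))
t.start()
t.join()
assert exception_queue.empty(), exception_queue.get()
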
def clear(self):
    '''
    Used to clear the current session from the GPU.
    '''
    K.clear_session()
    gc.collect()
    del self.model
    for gpu in range(len(cuda.gpus)):
        cuda.select_device(gpu)
        cuda.close()

def mpi_fq(n_nodes, m_list, q, scatter_array, qbin):
    """
    Breakup the job across the GPU enabled nodes

    Parameters
    ----------
    n_nodes: int
        Number of allocated nodes, not including the head node

    Returns
    -------
    list of floats:
        Amount of memory per GPU
    """
    from mpi4py import MPI
    kernel_loc = inspect.getfile(mpi_fq_worker)
    comm = MPI.COMM_WORLD.Spawn(sys.executable, args=[kernel_loc], maxprocs=n_nodes)
    n_cov = 0
    status = MPI.Status()
    m_list += ([StopIteration] * n_nodes)
    p = None
    thread_q = []
    for m in m_list:
        if m is StopIteration:
            msg = m
        else:
            msg = (q, scatter_array, qbin, m, n_cov)
        # If the thread on the main node is done, or not started:
        # give a problem to it
        if p is None or p.is_alive() is False:
            cuda.close()
            p = Thread(target=subs_fq,
                       args=(cuda.gpus.lst[0], q, scatter_array, thread_q, qbin, m, n_cov))
            p.start()
        else:
            comm.recv(source=MPI.ANY_SOURCE, status=status)
            comm.send(obj=msg, dest=status.Get_source())
        if type(m) == int:
            n_cov += m
    p.join()
    # Make certain we have covered all the atoms
    assert n_cov == len(q)
    # TODO: Make Numpy based Gather for faster memory transfer or Sum Reduce
    reports = comm.gather(root=MPI.ROOT)
    comm.Disconnect()
    reports += thread_q
    return reports

def test_api_post():
    # note the doubled backslash before demo_image; the original's lone `\d` was
    # an invalid escape that Python only tolerates with a warning
    path = "C:\\Users\\CAU\\Desktop\\capstone\\text_recognition\\demo_image"
    if os.path.exists(path):
        for file in os.scandir(path):
            os.remove(file.path)
    imagefile = request.files['image']
    filename = werkzeug.utils.secure_filename(imagefile.filename)
    print("\nReceived image File name : " + imagefile.filename)
    imagefile.save("./text_detection/test/" + filename)

    detection.run_detection()
    img_files, img_bbox = load_files()
    crop_img(img_files, img_bbox)
    pred_str = recognition.run_recognition()

    # underline detection
    cfg = PredictionConfig()
    # define the model
    model = MaskRCNN(mode='inference', model_dir='./', config=cfg)
    # load model weights
    model_path = 'mask_rcnn_underline_cfg_0020.h5'
    model.load_weights(model_path, by_name=True)
    temp = cv2.imread("./text_detection/test/androidFlask.jpg")
    yhat = model.detect([temp], verbose=0)[0]
    print(len(yhat['rois']))

    # box corners are ordered [l, t], [r, t], [r, b], [l, b]
    for i, file in enumerate(img_files):
        txt = pd.read_csv(img_bbox[i], header=None)
        df = pd.DataFrame(columns=["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4", "result_text"])
        # compare each text box against the detected underlines
        # (inner index renamed to `j`; the original reused and shadowed `i`)
        for j, bb in enumerate(txt.values):
            x1, y1, x2, y2, x3, y3, x4, y4 = bb
            for underline in yhat['rois']:
                uy1, ux1, uy2, ux2 = underline
                if x1 < (ux1 + ux2) / 2 < x3 and y1 < uy1 < y3:
                    df = df.append({"x1": x1, "y1": y1, "x2": x2, "y2": y2,
                                    "x3": x3, "y3": y3, "x4": x4, "y4": y4,
                                    "result_text": pred_str[j]}, ignore_index=True)
                    # draw the box: top-left and bottom-right corners of the rectangle
                    temp = cv2.rectangle(temp, (x1, y1), (x3, y3), (0, 0, 255), 3)
    df.to_csv("./result.csv")
    cv2.imwrite("./result.jpg", temp)

    from keras import backend as K
    K.clear_session()
    cuda.select_device(0)
    cuda.close()
    del model
    return "done"

def main(argv):
    generate_trajectory()
    learn_direction()
    if FLAGS.evaluation in ['qualitative', 'quantitative']:
        latent_traversal()
    if FLAGS.evaluation == 'quantitative':
        cuda.select_device(0)
        cuda.close()
        get_barycenter()
        measure_perf()

def newthread(exception_queue):
    try:
        cuda.select_device(0)
        stream = cuda.stream()
        A = np.arange(100)
        dA = cuda.to_device(A, stream=stream)
        stream.synchronize()
        del dA
        del stream
        cuda.close()
    except Exception as e:
        exception_queue.put(e)

def GPU_switch(GPU):
    if not GPU:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        tf.keras.backend.clear_session()
    else:
        # watch nvidia-smi
        cuda.select_device(0)
        cuda.close()
        print('CUDA memory released: GPU0')
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

def newthread(exception_queue):
    try:
        devices = range(driver.get_device_count())
        print('Devices', devices)
        for _ in range(2):
            for d in devices:
                cuda.select_device(d)
                print('Selected device', d)
                cuda.close()
                print('Closed device', d)
    except Exception as e:
        exception_queue.put(e)

def gpu_dedisperse(cand, device=0):
    """
    GPU dedispersion (by rolling the array)

    Args:
        cand: Candidate instance
        device (int): GPU ID

    Returns:
        candidate object
    """
    cuda.select_device(device)
    chan_freqs = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(cand.data.T))
    cand_data_out = cuda.to_device(np.zeros_like(cand.data.T))

    @cuda.jit
    def gpu_dedisp(cand_data_in, chan_freqs, dm, cand_data_out, tsamp):
        ii, jj = cuda.grid(2)
        if ii < cand_data_in.shape[0] and jj < cand_data_in.shape[1]:
            disp_time = int(
                -4148808.0
                * dm
                * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2)
                / 1000
                / tsamp
            )
            cand_data_out[ii, jj] = cand_data_in[
                ii, (jj + disp_time) % cand_data_in.shape[1]
            ]

    threadsperblock = (32, 32)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)
    gpu_dedisp[blockspergrid, threadsperblock](
        cand_data_in,
        chan_freqs,
        float(cand.dm),
        cand_data_out,
        float(cand.your_header.tsamp),
    )
    cand.dedispersed = cand_data_out.copy_to_host().T
    cuda.close()
    return cand

def reset_keras(device=0):
    cuda.select_device(device)
    cuda.close()
    print(gc.collect())  # if it's done something you should see a number being printed
    K.clear_session()
    sess = K.get_session()
    sess.close()
    # use the same config as you used to create the session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1
    config.gpu_options.visible_device_list = "0"
    K.set_session(tf.Session(config=config))

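# Assumed usage pattern for reset_keras above, between training runs in one
# process. The ordering matters: cuda.close() invalidates the context the old
# Keras session lived in, which is why a brand-new tf.Session is installed
# immediately afterwards. build_model is a hypothetical helper.
# model = build_model(); model.fit(x, y); del model
# reset_keras()
# model = build_model()  # starts on a fresh CUDA context
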
def Clear():
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()
    s = cuda.current_context().get_memory_info()
    print(s)
    cuda.current_context().deallocations.clear()
    s = cuda.current_context().get_memory_info()
    print(s)
    cuda.select_device(0)
    # do tf stuff
    cuda.close()

def clear_context(self) -> None:
    try:
        print("Clearing Context")
        devices_list: List[cuda.cudadrv.devices._DeviceContextManager] = cuda.list_devices().lst
        for device in devices_list:
            print("GPU device id: {}".format(device.id))
            cuda.select_device(device.id)
            cuda.close()
            device.reset()
    except cuda.cudadrv.error.CudaSupportError:
        pass
    finally:
        print("Context Cleared")

__author__ = 'christopher'

if __name__ == '__main__':
    from mpi4py import MPI
    from numba import cuda

    comm = MPI.Comm.Get_parent()
    rank = comm.Get_rank()

    meminfo = int(cuda.current_context().get_memory_info()[0])
    cuda.close()
    comm.gather(sendobj=meminfo, root=0)
    comm.Disconnect()

def setUp(self):
    # Reset before testing
    cuda.close()

def tearDown(self):
    cuda.close()

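# Every snippet in this collection revolves around the same numba lifecycle.
# A minimal, self-contained sketch of it (assumes a CUDA-capable device 0):
from numba import cuda
import numpy as np

cuda.select_device(0)                  # bind this thread to GPU 0
d_arr = cuda.to_device(np.arange(10))  # allocate within that context
print(d_arr.copy_to_host())
cuda.close()                           # destroy the context, freeing its memory
cuda.select_device(0)                  # a fresh context can be created afterwards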