def get_device(self):
    """Gathers an available GPU or CPU for further processing.

    Returns:
        A configuration object containing the device's information:
        ``{'gpu': {'DEVICE_ID': ..., 'MEMORY_FRACTION': ...}}`` when a GPU
        was selected and ``{'cpu': {}}`` in every other case, so callers
        always receive a configuration.

    """

    # Tries to check if there is an available GPU
    try:
        # Logs the name of every GPU reported by GPUtil
        for g in GPUtil.getGPUs():
            logging.info(g.name)

        # Calculates the load and memory per process
        load_per_process, mem_per_process = self.get_gpu_config()

        # A GPU qualifies only if it has room left for one more process
        max_load = 1 - load_per_process
        max_mem = 1 - mem_per_process

        # Gathers the first available GPU
        device_id = GPUtil.getFirstAvailable(order='first',
                                             maxLoad=max_load,
                                             maxMemory=max_mem,
                                             attempts=3,
                                             interval=3,
                                             verbose=False)[0]

    # Any failure while probing is deliberately treated as "no GPU"
    except Exception as e:
        logging.warning(e)
        device_id = None

    # Falls back to the CPU configuration when no device id was obtained
    if device_id is None:
        return {'cpu': {}}

    # Creates the GPU configuration object
    return {
        'gpu': {
            'DEVICE_ID': device_id,
            'MEMORY_FRACTION': mem_per_process
        }
    }
def gpu_conf(cfg, gpu_id=None):
    """Store the GPU id to use on ``cfg.GPU_ID`` and return ``cfg``.

    Args:
        cfg: configuration object; its ``GPU_ID`` attribute is assigned.
        gpu_id: explicit GPU id. When ``None`` the first id returned by
            ``GPUtil.getFirstAvailable()`` is used instead.

    Returns:
        The same ``cfg`` object that was passed in.
    """
    if gpu_id is None:
        device_ids = GPUtil.getFirstAvailable()
        if device_ids:
            cfg.GPU_ID = device_ids[0]  # grab first element from list
    else:
        cfg.GPU_ID = gpu_id
    return cfg
def pick_device():
    """Mask out every GPU except the first available one.

    Prints the GPU utilization table, then sets ``CUDA_VISIBLE_DEVICES``
    to the id of the first available GPU. Detection failures are logged
    with their traceback and otherwise ignored (best effort), so the
    environment is left untouched in that case.
    """
    try:
        GPUtil.showUtilization()
        # Get the first available GPU
        device_id = GPUtil.getFirstAvailable()[0]
        # Set CUDA_VISIBLE_DEVICES to mask out all other GPUs than the
        # first available device id
        os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
        logging.debug('Device ID (unmasked): %s', device_id)
    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still
        # propagate; every ordinary error means "no usable GPU".
        logging.exception('Cannot detect GPUs')
def processing(path, word_index, input_length, x_train):
    """Process a string array with pretrained vectors.

    Converts an n dimension string array into an n * k * m dimension float
    numpy array. Each k * m array represents a string. k is the
    input_length, an upper bound of the string length: shorter strings are
    padded and longer strings are cropped. m is defined by the pretrained
    file.

    Args:
        path: String, path where the pretrained files are stored.
        word_index: Dictionary, contains words with their tokenized index.
        input_length: Int, an upper bound of the string length.
        x_train: String array.

    Returns:
        x_train: Numpy array as processed x_train.
    """
    import tensorflow as tf
    embedding_matrix = load_pretrain(path=path, word_index=word_index)
    # Get the first available GPU
    device_id_list = GPUtil.getFirstAvailable()
    device_id = device_id_list[0]  # grab first element from list
    # Set CUDA_VISIBLE_DEVICES to mask out all other GPUs than the first
    # available device id
    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
    # With every other GPU masked, the selected one is addressed as gpu:0
    device = '/gpu:0'
    with tf.device(device):
        # keras is imported only after the GPU mask has been set
        from keras import Input, Model
        from keras import backend
        from keras.layers import Embedding
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        backend.set_session(sess)
        print("generating preprocessing model...")
        # Frozen embedding layer initialised from the pretrained matrix
        embedding_layer = Embedding(len(word_index) + 1,
                                    Constant.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=input_length,
                                    trainable=False)
        sequence_input = Input(shape=(input_length,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        # The model only maps token indices to their embedding vectors
        model = Model(sequence_input, embedded_sequences)
        print("converting text to vector...")
        x_train = model.predict(x_train)
        del model
    return x_train
def get_gpufirstavailable(self):
    """Return the id of the first available GPU.

    Availability is decided by GPUtil from GPU load and memory usage; an
    error is raised when no GPU is available. All ``getAvailable``
    parameters are accepted with the same meaning, plus:

    attempts: number of times to retry when no GPU can be obtained.
    interval: time between two attempts, in seconds.
    verbose: whether to print the number of attempts made when the best
        available GPU is found.
    """
    return GPUtil.getFirstAvailable(
        order=self.order,
        attempts=self.attempts,
        interval=self.interval,
        verbose=self.verbose,
    )
def get_gpu_info():
    """Print per-GPU memory statistics and collect availability info.

    :return: a list holding one
        ``[id, memoryTotal, memoryUsed, memoryUtil * 100]`` entry per GPU,
        followed by the list returned by ``GPUtil.getAvailable`` and the
        list returned by ``GPUtil.getFirstAvailable``.
    """
    gpulist = []
    GPUtil.showUtilization()

    # Query the GPUs here so the statistics are current and the function
    # does not depend on a module-level ``Gpus`` variable.
    for gpu in GPUtil.getGPUs():
        print('GPU.id:', gpu.id)
        print('GPU总量:', gpu.memoryTotal)
        print('GPU使用量:', gpu.memoryUsed)
        print('GPU使用占比:', gpu.memoryUtil * 100)
        print('GPU.id:', gpu.id)
        # Add the information GPU by GPU
        gpulist.append(
            [gpu.id, gpu.memoryTotal, gpu.memoryUsed, gpu.memoryUtil * 100])

    # Return the list of available GPU ids based on load and memory usage.
    #   order='first': returned ids are sorted in ascending order
    #   limit: number of available GPU ids to return
    #   maxLoad: maximum GPU load (GPUs above it are not returned)
    #   maxMemory: maximum GPU memory usage (GPUs above it are not returned)
    #   includeNan: whether to include GPUs whose load or memory is NaN
    #   excludeID: list of GPU ids to exclude
    #   excludeUUID: like excludeID, but with UUIDs instead of ids
    GPUavailable = GPUtil.getAvailable(order='first',
                                       limit=1,
                                       maxLoad=0.5,
                                       maxMemory=0.5,
                                       includeNan=False,
                                       excludeID=[],
                                       excludeUUID=[])
    gpulist.append(GPUavailable)

    # Return the first available GPU id based on load and memory usage; an
    # error is raised when no GPU is available. All getAvailable
    # parameters are accepted with the same meaning, plus:
    #   attempts: number of retries when no GPU can be obtained
    #   interval: time between two attempts, in seconds
    #   verbose: whether to print the number of attempts on success
    GPUfirstavailable = GPUtil.getFirstAvailable(order='first',
                                                 attempts=1,
                                                 interval=900,
                                                 verbose=False)
    gpulist.append(GPUfirstavailable)
    return gpulist
def WaitForGPU(wait=300):
    """Block until a GPU is available, then expose only that GPU to CUDA.

    Repeatedly asks GPUtil for the first available GPU. On success the id
    is printed and written to ``CUDA_VISIBLE_DEVICES``; on failure the
    function sleeps and tries again, without any upper bound on retries.

    Args:
        wait: number of seconds to sleep between two attempts.
    """
    while True:
        try:
            device_id = GPUtil.getFirstAvailable()[0]
        except Exception:
            # No GPU available
            print('Waiting for GPU...')
            time.sleep(wait)
        else:
            print('Using GPU', device_id)
            os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
            return
def lauchCorrMutProgram(self, aligFormatedName, ignoreGPU=False):
    """Run the ccmpred binary on an alignment, preferring a GPU.

    The GPU is used only when ``ignoreGPU`` is false, GPUtil reports an
    available device and the estimated memory requirement (plus a 10%
    margin) is below ``self.getTotalGPUMemory`` for that device. If the
    GPU run yields no result, the program is run again on the CPU.

    Args:
        aligFormatedName: path of the formatted alignment file.
        ignoreGPU: when true, skip GPU detection and run on the CPU.

    Returns:
        Whatever ``self.processOutput`` returns for the run, or ``None``
        when it raised ``ValueError`` (after the CPU retry, if any).
    """
    nrows, ncols = self.getAligsDims(aligFormatedName)
    # NOTE(review): presumably a size in bytes, in the same unit as
    # getTotalGPUMemory - confirm against that helper.
    memoryRequired = 4 * (4 * (ncols * ncols * 32 * 21 + ncols * 20) +
                          23 * nrows * ncols + nrows +
                          ncols * ncols) + 2 * nrows * ncols + 1024
    tmpResults = os.path.join(self.tmp, os.path.basename(aligFormatedName))

    # Only probe for a GPU when it may actually be used: the probe retries
    # with a delay, which is wasted time on the CPU-only path.
    gpuNumber = None
    if not ignoreGPU:
        try:
            gpuNumber = GPUtil.getFirstAvailable(order='first',
                                                 maxLoad=0.3,
                                                 maxMemory=0.3,
                                                 attempts=2,
                                                 interval=3)[0]
        except (RuntimeError, OSError, ValueError):
            gpuNumber = None

    # Without a device id there is nothing to pass to "-d", so the GPU
    # path additionally requires that a GPU was found.
    wasRunOnGPU = (
        gpuNumber is not None and
        memoryRequired * 1.1 < self.getTotalGPUMemory(gpuNumber)
    )  # *1.1 as a margin of tolerance
    if wasRunOnGPU:
        cmdArray = [
            self.ccmPredBin, "-R", "-d",
            str(gpuNumber), aligFormatedName, tmpResults
        ]
    else:
        cmdArray = [
            self.ccmPredBin, "-R", "-t",
            str(self.corrMutNThrs), aligFormatedName, tmpResults
        ]
    print(" ".join(cmdArray))
    process = Popen(cmdArray, stdout=PIPE, stderr=PIPE)
    processOut = process.communicate()
    try:
        iterOfCorrelatedRows = self.processOutput(processOut, tmpResults)
    except ValueError as e:
        print(e)
        iterOfCorrelatedRows = None
    if iterOfCorrelatedRows is None and wasRunOnGPU:
        print("Error running ccmpred on gpu, trying cpu")
        iterOfCorrelatedRows = self.lauchCorrMutProgram(aligFormatedName,
                                                        ignoreGPU=True)
    return iterOfCorrelatedRows
def prepare_environment(resource_limit, log):
    ''' Prepares the environment by choosing one GPU to run on, adjusts
    CUDA_VISIBLE_DEVICES env var for TF, sets the max. process length so
    that the training won't time out.

    Both steps are best effort: a failure is reported with ``print`` and
    does not stop the other step.

    Args:
        resource_limit: new soft limit for RLIMIT_CPU (the hard limit is
            left unchanged).
        log: callable receiving the informational message once a GPU has
            been selected.
    '''
    try:
        DEVICE_ID = GPUtil.getFirstAvailable(order='last',
                                             maxLoad=0.85,
                                             verbose=True)[0]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
        log('Preparing environment by choosing a gpu {} and setting resource limit={}'
            .format(DEVICE_ID, resource_limit))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        print('No GPU found, continuing in CPU mode.')
    try:
        _, hard = resource.getrlimit(resource.RLIMIT_CPU)
        resource.setrlimit(resource.RLIMIT_CPU, (resource_limit, hard))
    except Exception:
        print('No limit set.')
def create_model(g_conv_dim=64, d_conv_dim=64, n_res_blocks=6):
    """Build the two generators and two discriminators on one device.

    Args:
        g_conv_dim: ``conv_dim`` passed to both generators.
        d_conv_dim: ``conv_dim`` passed to both discriminators.
        n_res_blocks: ``n_res_blocks`` passed to both generators.

    Returns:
        The tuple ``(G_XtoY, G_YtoX, D_X, D_Y, device)``.
    """
    # Generators share one set of constructor arguments
    generator_args = dict(conv_dim=g_conv_dim, n_res_blocks=n_res_blocks)
    G_XtoY = CycleGenerator(**generator_args)
    G_YtoX = CycleGenerator(**generator_args)

    # Discriminators
    D_X = Discriminator(conv_dim=d_conv_dim)
    D_Y = Discriminator(conv_dim=d_conv_dim)

    # First available GPU id when CUDA is usable, otherwise the CPU
    if torch.cuda.is_available():
        target = GPUtil.getFirstAvailable()[0]
    else:
        target = "cpu"
    device = torch.device(target)
    print('device=',device)

    for network in (G_XtoY, G_YtoX, D_X, D_Y):
        network.to(device)

    return G_XtoY, G_YtoX, D_X, D_Y, device
def gpu_setup(gpu_id):
    """Select the GPU to expose to CUDA and return the torch device.

    Args:
        gpu_id: GPU id (anything ``int()`` accepts), a negative value to
            run on the CPU, or ``"auto"`` to let GPUtil pick the first
            available GPU ordered by memory usage.

    Returns:
        ``torch.device("cuda:0")`` when CUDA is available after masking,
        otherwise ``torch.device("cpu")``.
    """
    # set up GPUS
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # Start with every GPU hidden; a valid id below re-enables one
    os.environ['CUDA_VISIBLE_DEVICES'] = ""
    if gpu_id == "auto":
        try:
            import GPUtil
        except ImportError:
            print("can't import GPUtil. maybe you can do: pip install gputil")
            print("gpu id is set to -1")
            gpu_id = -1
        else:
            try:
                # try to find empty gpu automatically
                gpu_id = GPUtil.getFirstAvailable(order='memory',
                                                  maxLoad=0.5,
                                                  maxMemory=0.5,
                                                  attempts=1,
                                                  interval=900,
                                                  verbose=False)[0]
            except Exception as err:
                # GPUtil is installed but no GPU could be selected: say
                # so instead of blaming the import.
                print("can't find an available gpu: %s" % err)
                print("gpu id is set to -1")
                gpu_id = -1
    gpu_id = int(gpu_id)
    if gpu_id >= 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    # Only one GPU is visible at most, so it is always index 0
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("gpu id: %s"%gpu_id)
    print("using device: %s"%device)
    return device
def configure_gpu_tf():
    """
    This is an example for how to customise the search for a GPU for a
    specific job depending on hardware/organisational requirements. In this
    case, we have a machine with two GPUs on which we want to support three
    simultaneous GPU jobs (& unlimited CPU).

    Falls back to the CPU (``CUDA_VISIBLE_DEVICES = '-1'``) when a
    ``RuntimeError`` is raised while selecting or configuring the GPU, or
    when TensorFlow lists no physical GPU.

    Raises:
        AssertionError: if the virtual GPU configuration does not result
            in exactly one logical GPU.
    """
    try:
        # locate available devices & set required environment variables
        available_device_id = GPUtil.getFirstAvailable(order='first',
                                                       maxLoad=0.7,
                                                       maxMemory=0.7,
                                                       attempts=1,
                                                       interval=10)[0]
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        os.environ['CUDA_VISIBLE_DEVICES'] = str(available_device_id)
        print(f"\n GPU Found! running on GPU:{available_device_id}\n")

        # set GPU configuration (use all GPU memory if device 0, else use
        # <50% of memory)
        tf.debugging.set_log_device_placement(False)
        # Raises IndexError when TensorFlow itself sees no GPU
        physical_gpu = tf.config.experimental.list_physical_devices('GPU')[0]
        if available_device_id == 0:
            tf.config.experimental.set_memory_growth(physical_gpu, True)
        else:
            tf.config.experimental.set_virtual_device_configuration(
                physical_gpu, [
                    tf.config.experimental.VirtualDeviceConfiguration(
                        memory_limit=4500)
                ])
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            # Explicit raise so the check also runs under ``python -O``
            if len(logical_gpus) != 1:
                raise AssertionError(
                    "error creating virtual GPU to fractionally use memory")

    # if we can't find a GPU, or they are all busy, default to using CPU
    except (RuntimeError, IndexError):
        print("\n No GPUs available... running on CPU\n")
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
def configure_gpu(prefered: Optional[int]) -> None:
    """Configure the CUDA GPU (if applicable).

    Sets ``CUDA_VISIBLE_DEVICES`` to the selected GPU id. When GPUtil
    cannot be imported a warning is emitted and nothing is changed.

    Args:
        prefered: id of the GPU to use if it is available; ``None`` to let
            GPUtil pick a random available GPU, retrying for up to an
            hour.

    Raises:
        AssertionError: if ``prefered`` is given and no GPU at all is
            available.
    """
    try:
        import GPUtil  # type: ignore
    except ImportError:
        warnings.warn("Could not import GPUtil: using default GPU.")
        return

    # Use an equal slice of the machine as the number of physical cores.
    # This is usually the ideal number of processes before resource
    # contention.
    max_resources = 1. - 1. / full_cores_available()
    limits = dict(maxLoad=max_resources, maxMemory=max_resources)

    if prefered is None:
        # Select an available GPU.
        device_id, = GPUtil.getFirstAvailable(
            order='random',
            attempts=2 * 60,  # try for an hour to get a GPU
            interval=30,  # Try every 30 seconds.
            verbose=True,
            **limits)
    else:
        # Get the GPUs with the LEAST memory utilization.
        available = GPUtil.getAvailable(order='memory', limit=16, **limits)
        # Explicit raise so the check is not stripped under ``python -O``
        if not available:
            raise AssertionError("no GPU is available")
        if prefered in available:
            device_id = prefered
        else:
            logger.warning("Requested GPU %d unavaible", prefered)
            # Since the preference is unavailable, use the GPU with the
            # least allocated memory.
            device_id = available[0]

    # Set the prefered GPU ID.
    logger.info("Using GPU %d", device_id)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
# $
# $ pip install bert-serving-server
# $ pip install bert-serving-client
# $

# using BertClient inside tf.data API

import json
import os
import time

import GPUtil
import tensorflow as tf
from bert_serving.client import BertClient

# getFirstAvailable() returns a list of ids. CUDA_VISIBLE_DEVICES needs the
# bare id (e.g. "0"), not the list representation (e.g. "[0]").
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUtil.getFirstAvailable()[0])

train_fp = ['/data/cips/data/larry-autoencoder/cail_0518/data_train.json']
batch_size = 256
num_parallel_calls = 4
num_concurrent_clients = 10  # should be greater than `num_parallel_calls`

# One client per concurrent caller
bc_clients = [
    BertClient(show_server_config=False)
    for _ in range(num_concurrent_clients)
]


def get_encodes(x):
    # x is `batch_size` of lines, each of which is a json object
    samples = [json.loads(l) for l in x]
    # Keep only the last 50 characters of each sample's 'fact' field
    text = [s['fact'][-50:] for s in samples]
    # NOTE(review): nothing is returned by the statements visible here;
    # confirm whether the rest of this function is missing.
default=-1, type=int, help='Which gpu to use. If -1, determine automatically') args = parser.parse_args() dl_kwargs_train = parse_json_file_str(args.dl_kwargs_train) dl_kwargs_eval = parse_json_file_str(args.dl_kwargs_eval) if args.add_n_hidden == "": hidden = [] else: hidden = [int(x) for x in args.add_n_hidden.split(",")] # ------- odir = Path(args.output) odir.mkdir(parents=True, exist_ok=True) if args.gpu == -1: gpu = GPUtil.getFirstAvailable(attempts=3, includeNan=True)[0] else: gpu = args.gpu create_tf_session(gpu) # Get the model and the dataloader model = kipoi.get_model(args.model, args.source) if args.dataloader is not None: Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source) else: Dl = model.default_dataloader if not model.type == "keras": raise ValueError("Only keras models are supported")
from __future__ import print_function import numpy as np from six.moves import range import h5py import scipy.io as io import sys, os import itertools as it import time import GPUtil #from api.resources.preprocessing.DeepVess.TrainDeepVess import train_deep_vess #from api.resources.preprocessing.DeepVess.DeepVessModel import define_deepvess_architecture os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" deviceID = GPUtil.getFirstAvailable(order='random', maxLoad=0.7, maxMemory=0.6, attempts=10, interval=100, verbose=True) os.environ["CUDA_VISIBLE_DEVICES"] = str(deviceID[0]) import tensorflow as tf def start_tracing_model(inputData, isTrain=False, isForward=True, padSize=((3, 3), (16, 16), (16, 16), (0, 0))): """ :param inputData: :param isTrain: Change isTrain to True if you want to train the network :param isForward: Change isForward to True if you want to test the network
import GPUtil
import datetime
import os

# Choose the first available GPU and mask out all others. This runs before
# the imports below - presumably so the mask is in place before any
# CUDA-using library is loaded (TODO confirm).
DEVICE_ID_LIST = GPUtil.getFirstAvailable()
DEVICE_ID = DEVICE_ID_LIST[0]
os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)

import argparse
import gym
import os
import sys
import pickle
import time

# Make the parent directory of this script importable
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# NOTE(review): `torch` and `DoubleTensor` are not imported explicitly;
# presumably they come from this star import - verify against utils.
from utils import *
from models.mlp_policy import Policy
from models.mlp_critic import Value
from models.mlp_policy_disc import DiscretePolicy
from torch.autograd import Variable
from core.ppo import ppo_step
from core.common import estimate_advantages
from core.agent import Agent

# Use double precision tensors by default
Tensor = DoubleTensor
torch.set_default_tensor_type('torch.DoubleTensor')

# Command-line interface
parser = argparse.ArgumentParser(description='PyTorch PPO example')
parser.add_argument('--env-name',
                    default="Reacher-v1",
                    metavar='G',
                    help='name of the environment to run')
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate the estimator selected by ``Config``.

    In training mode the training/validation inputs are built (or loaded
    from ``Config.training_dump`` when that file exists), the estimator is
    fitted and saved. In any other mode the test inputs are built,
    predictions are written with ``generate_submission`` and metrics are
    printed when the test set carries labels.

    Args:
        mode: run phase; ``RTERunPhase.train`` selects training, any other
            value selects the testing branch.
        config: when it is a ``str``, it is passed to
            ``Config.load_config``.
        estimator: estimator to use. When ``None`` one is created with
            ``get_estimator`` (in testing mode ``load_model`` is tried
            first).

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)

    # Optional switches default to False when Config does not define them.
    # 'esim_inter_evidence' model and
    # 'esim_inter_evidence_claim_evidences_comparison' models need inter
    # evidence inputs
    use_inter_evidence_comparison = (
        getattr(Config, 'use_inter_evidence_comparison', False)
        or Config.estimator_name in {
            'esim_inter_evidence',
            'esim_inter_evidence_claim_evidences_comparison'
        })
    # 'esim_inter_evidence_claim_evidences_comparison' model needs
    # claim-evidence inputs
    use_claim_evidences_comparison = (
        getattr(Config, 'use_claim_evidences_comparison', False)
        or Config.estimator_name in {
            'esim_inter_evidence_claim_evidences_comparison'
        })
    use_extra_features = getattr(Config, 'use_extra_features', False)
    # 'esim_num_feature' model needs numeric feature inputs
    use_numeric_feature = (
        getattr(Config, 'use_numeric_feature', False)
        or Config.estimator_name in {'esim_num_feature'})
    is_snopes = getattr(Config, 'is_snopes', False)

    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_sentence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))

    def embed(set_file, **glove_kwargs):
        """Embed one data set file and add axis 1 to its claim arrays."""
        data_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
            set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes,
            **glove_kwargs)
        data = data_set['data']
        h_sent_sizes = data['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        data['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        data['h_sizes'] = h_sizes
        data['h_np'] = np.expand_dims(data['h_np'], 1)
        return data_set, vocab, embeddings

    def add_optional_inputs(sets):
        """Add the enabled optional inputs to each ``(data, file)`` pair."""
        if use_extra_features:
            assert hasattr(
                Config, 'feature_path'
            ), "Config should has feature_path if Config.use_feature is True"
            features = [
                load_feature_by_data_set(set_file, Config.feature_path,
                                         Config.max_sentences)
                for _, set_file in sets
            ]
            for (data, _), (claim_feats, evidence_feats) in zip(sets,
                                                               features):
                data['h_feats'] = claim_feats
                data['b_feats'] = evidence_feats
        if use_numeric_feature:
            num_feats = [
                number_feature(set_file, Config.db_path, Config.max_sentences,
                               is_snopes) for _, set_file in sets
            ]
            for (data, _), num_feat in zip(sets, num_feats):
                data['num_feat'] = num_feat
        if use_inter_evidence_comparison:
            for data, _ in sets:
                indices, sizes = generate_concat_indices_for_inter_evidence(
                    data['b_np'], data['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                data['b_concat_indices'] = indices
                data['b_concat_sizes'] = sizes
        if use_claim_evidences_comparison:
            for data, _ in sets:
                indices, sizes = generate_concat_indices_for_claim(
                    data['b_np'], data['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                data['b_concat_indices_for_h'] = indices
                data['b_concat_sizes_for_h'] = sizes

    def select_gpu_if_unset():
        """Pick a GPU unless CUDA_VISIBLE_DEVICES already names one."""
        if not os.environ.get('CUDA_VISIBLE_DEVICES', '').strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0,
                    maxMemory=1.0 - Config.max_gpu_memory)[0])

    if mode == RTERunPhase.train:
        # # training mode
        has_dump = hasattr(Config, 'training_dump')
        if has_dump and os.path.exists(Config.training_dump):
            # NOTE: unpickling can execute arbitrary code; the dump file
            # must come from a trusted location.
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings = embed(
                Config.training_set_file, glove_path=Config.glove_path)
            # The validation set reuses the training vocabulary/embeddings
            valid_set, _, _ = embed(Config.dev_set_file,
                                    vocab_dict=vocab,
                                    glove_embeddings=embeddings)
            add_optional_inputs([
                (training_set['data'], Config.training_set_file),
                (valid_set['data'], Config.dev_set_file),
            ])
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if has_dump:
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        select_gpu_if_unset()
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        # Parameters are restored only when no estimator was passed in
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _ = embed(Config.test_set_file,
                               vocab_dict=vocab,
                               glove_embeddings=embeddings)
        add_optional_inputs([(test_set['data'], Config.test_set_file)])
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        select_gpu_if_unset()
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
# optimize every params that require grad if optimizer_name == 'Adam': optimizer = torch.optim.Adam(filter( lambda p: p.requires_grad, model.parameters()), lr=opt_lr) # attempt to sent to GPU, else train over CPU model_sent_to_device = False sleep_time = 30 while not model_sent_to_device and sleep_time < 4800: # get free device device = torch.device('cuda') try: device_id = GPUtil.getFirstAvailable(order='memory', maxLoad=1.0, maxMemory=0.8, verbose=False)[0] # send to least used GPU print('Using GPU:', device_id) with torch.cuda.device(device_id): model.to(device) model_sent_to_device = True except Exception as e: print(e) sleep_time = 1.66 * sleep_time print('GPU error. Wait {}s and continue'.format( sleep_time)) time.sleep(sleep_time) if not model_sent_to_device:
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate the estimator selected by ``Config``.

    In training mode the training/validation sets are read from JSONL (or
    loaded from ``Config.training_dump`` when that file exists), the
    estimator is fitted and saved. In any other mode the test set is read,
    predictions are written with ``generate_submission`` and metrics are
    printed when test labels are present.

    Args:
        mode: run phase; ``RTERunPhase.train`` selects training, any other
            value selects the testing branch.
        config: when it is a ``str``, it is passed to
            ``Config.load_config``.
        estimator: estimator to use. When ``None`` one is created with
            ``get_estimator`` (in testing mode ``load_model`` is tried
            first).

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    # Defaults to False when Config does not define the switch
    is_snopes = getattr(Config, 'is_snopes', False)
    logger.debug("is_snopes: " + str(is_snopes))

    def read_set(set_file):
        """Read one JSONL data set with the configured limits."""
        return read_data_set_from_jsonl(set_file,
                                        Config.db_path,
                                        num_sentences=Config.max_sentences,
                                        is_snopes=is_snopes)

    def pad_bodies(X, name):
        """Pad every body of ``X`` to ``Config.max_sentences``, in place."""
        bodies = X['b']
        # Sentence counts are recorded before the padding is added
        X['b_sizes'] = get_num_sents_of_bodies(bodies)
        for i, sample in enumerate(bodies):
            # A non-positive count yields an empty list, i.e. no padding
            sample.extend([" "] * (Config.max_sentences - len(sample)))
            bodies[i] = np.asarray(sample)
        bodies = np.asarray(bodies)
        X['b'] = bodies
        logger.debug(name + ".shape: " + str(bodies.shape))

    def select_gpu_if_unset():
        """Pick a GPU unless CUDA_VISIBLE_DEVICES already names one."""
        if not os.environ.get('CUDA_VISIBLE_DEVICES', '').strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0,
                    maxMemory=1.0 - Config.max_gpu_memory)[0])

    if mode == RTERunPhase.train:
        # training mode
        has_dump = hasattr(Config, 'training_dump')
        if has_dump and os.path.exists(Config.training_dump):
            # NOTE: unpickling can execute arbitrary code; the dump file
            # must come from a trusted location.
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid,
                 Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_set(Config.training_set_file)
            X_valid, Y_labels_valid = read_set(Config.dev_set_file)
            pad_bodies(X_train, "b_train")
            pad_bodies(X_valid, "b_valid")
            if has_dump:
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        select_gpu_if_unset()
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        # Parameters are restored only when no estimator was passed in
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        X_test, Y_labels_test = read_set(Config.test_set_file)
        pad_bodies(X_test, "b_test")
        select_gpu_if_unset()
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file,
                            Config.submission_file)
        if Y_labels_test:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
import torch import os import GPUtil from data_load import data_load as DL import numpy as np devices = "%d" % GPUtil.getFirstAvailable(order="memory")[0] os.environ["CUDA_VISIBLE_DEVICES"] = str(devices) # torch.cuda.manual_seed(1234) # l = torch.cuda.get_rng_state() # torch.bernoulli(torch.full((5,5), 0.5, device='cuda')) # # torch.cuda.manual_seed(1234) # l2 = torch.cuda.get_rng_state() # # (l==l2).all().item() np.random.seed(1234) torch.manual_seed(1234) torch.cuda.manual_seed(1234) torch.cuda.manual_seed_all(1234) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False n_sample = 10 n_dim = 3 data = np.random.randint((50), size=(n_sample, n_dim)) label = np.random.randint((3), size=(n_sample,)) train_loader = DL.convert_Dloader(5, data, label, is_training=True, num_workers=0, shuffle=True) for epoch in range(3):
def main(mode: RTERunPhase, config=None, estimator=None):
    """Train or evaluate the estimator selected by ``Config``.

    In training mode the training/validation inputs are built (or loaded
    from ``Config.training_dump`` when that file exists), the estimator is
    fitted and saved. In any other mode the test inputs are built,
    predictions are written with ``generate_submission`` and metrics are
    printed when the test set carries labels.

    Args:
        mode: run phase; ``RTERunPhase.train`` selects training, any other
            value selects the testing branch.
        config: when it is a ``str``, it is passed to
            ``Config.load_config``.
        estimator: estimator to use. When ``None`` one is created with
            ``get_estimator`` (in testing mode ``load_model`` is tried
            first).

    Returns:
        The estimator that was fitted or used for prediction.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    logger.info("this script is only for FEVER dataset")

    def embed(set_file, **glove_kwargs):
        """Embed one data set file, reshape its claims and load scores."""
        data_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
            set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            **glove_kwargs)
        data = data_set['data']
        h_sent_sizes = data['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        data['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        data['h_sizes'] = h_sizes
        data['h_np'] = np.expand_dims(data['h_np'], 1)
        data['scores'] = load_scores(set_file, Config.max_sentences)
        return data_set, vocab, embeddings

    def select_gpu_if_unset():
        """Pick a GPU unless CUDA_VISIBLE_DEVICES already names one."""
        if not os.environ.get('CUDA_VISIBLE_DEVICES', '').strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(
                    maxLoad=1.0,
                    maxMemory=1.0 - Config.max_gpu_memory)[0])

    if mode == RTERunPhase.train:
        # # training mode
        has_dump = hasattr(Config, 'training_dump')
        if has_dump and os.path.exists(Config.training_dump):
            # NOTE: unpickling can execute arbitrary code; the dump file
            # must come from a trusted location.
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings = embed(
                Config.training_set_file, glove_path=Config.glove_path)
            # The validation set reuses the training vocabulary/embeddings
            valid_set, _, _ = embed(Config.dev_set_file,
                                    vocab_dict=vocab,
                                    glove_embeddings=embeddings)
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if has_dump:
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        select_gpu_if_unset()
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        # Parameters are restored only when no estimator was passed in
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _ = embed(Config.test_set_file,
                               vocab_dict=vocab,
                               glove_embeddings=embeddings)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        select_gpu_if_unset()
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
def bestGPU(num_gpus=8):
    """Return the id list of the first available GPU, or a random fallback.

    Args:
        num_gpus: Size of the id range ``0 .. num_gpus - 1`` that the
            fallback id is drawn from. Defaults to 8, which reproduces the
            previously hard-coded range 0-7.

    Returns:
        list: Whatever ``GPUtil.getFirstAvailable()`` returns, or a
        one-element list holding a random id when that call raises
        ``RuntimeError``.

    Raises:
        ValueError: From ``random.randint`` if the fallback is needed and
            ``num_gpus`` is smaller than 1.
    """
    try:
        return GPUtil.getFirstAvailable()
    except RuntimeError:
        # Best-effort fallback: nothing was reported as available, so guess an
        # id instead of failing. The guessed device may well be busy.
        return [random.randint(0, num_gpus - 1)]
"""Script that selects first available GPU"""
import os

import GPUtil

# Ask GPUtil for the first GPU it considers available; the result is indexed,
# and its first element is the id used below.
devicesid = GPUtil.getFirstAvailable()
_selected_id = devicesid[0]

# Set the TensorFlow C++ log level and mask CUDA down to the selected id.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = str(_selected_id)

print('GPU %d was selected' % _selected_id)
print(GPU.getAvailable(order='last', limit=1)) # Get 1 random available GPU print('Random available: '), print(GPU.getAvailable(order='random')) # Get 1 available GPU, ordered by GPU load ascending print('First available weighted by GPU load ascending: '), print(GPU.getAvailable(order='load', limit=1)) # Get all available GPU with max load of 10%, ordered by memory ascending print('All available weighted by memory load ascending: '), print(GPU.getAvailable(order='memory', limit=999, maxLoad=0.1)) # Get the first available GPU firstGPU = GPU.getFirstAvailable() print('First available GPU id:'), print(firstGPU) # Get the first available GPU, where memory usage is less than 90% and processing is less than 80% firstGPU = GPU.getFirstAvailable(maxMemory=0.9, maxLoad=0.8) print('First available GPU id (memory < 90%, load < 80%):'), print(firstGPU) # Get the first available GPU, where processing is less than 1% firstGPU = GPU.getFirstAvailable(attempts=5, interval=5, maxLoad=0.01, verbose=True) print('First available GPU id (load < 1%):'), print(firstGPU)
def train(config):
    """Train ``MyModel`` and checkpoint its best states after every epoch.

    Steps performed:

    1. Create ``config['_cwd']`` and pin a GPU through
       ``CUDA_VISIBLE_DEVICES`` (the configured value, or the first one
       GPUtil reports as available).
    2. Build the balanced-train, plain-train and balanced-validation
       generators from the cached annotation CSVs, optionally previewing
       ``config['n_batches_preview']`` batches of each.
    3. Resume model/optimizer state from ``model_best_ravg_loss.pth`` and
       ``optimizer_best_ravg_loss.pth`` when those files exist.
    4. Run ``config['n_epochs']`` epochs over the balanced training loader.
       After each epoch the validation loader is evaluated, the learning
       rate is decayed / reset, checkpoints are saved and one record is
       appended to ``log.json`` and ``displayed_log.txt``.

    Args:
        config: Mapping of run settings. Keys read here: ``_cwd``,
            ``cuda_visible_devices``, ``path_to_train_windowed_anno_cache``,
            ``path_to_valid_windowed_anno_cache``, ``n_batches_preview``,
            ``class_ids``, ``batch_size``, ``steps_per_epoch``,
            ``steps_per_epoch_for_valid``, ``starting_lr``, ``i_fold`` and
            ``n_epochs``. The mapping is also passed on to the project
            helpers, which may read further keys.

    Returns:
        dict: ``{'id_': config['class_ids']}``.
    """
    # presumably builds paths relative to config['_cwd'] -- verify in gen_cwd_slash
    cwd_slash = gen_cwd_slash(config)
    os.makedirs(config['_cwd'], exist_ok=True)

    # Honour an explicitly configured device, otherwise let GPUtil choose one.
    if config['cuda_visible_devices'] is not None:
        debug(f"Using GPU: {config['cuda_visible_devices']}")
        os.environ['CUDA_VISIBLE_DEVICES'] = config['cuda_visible_devices']
    else:
        avail_gpu = str(GPUtil.getFirstAvailable()[0])
        debug(f"Selecting the first available GPU: {avail_gpu}")
        os.environ['CUDA_VISIBLE_DEVICES'] = avail_gpu

    train_windowed_anno = pd.read_csv(config['path_to_train_windowed_anno_cache'], index_col=0)
    valid_windowed_anno = pd.read_csv(config['path_to_valid_windowed_anno_cache'], index_col=0)

    # --- data generators (each optionally previewed) -------------------------
    train_balanced_generator = data_gen_from_anno_gen(
        gen_even_batches(
            train_windowed_anno,
            config,
            target_col='corrected_target',
        ),
        config,
        target_col='corrected_target',
        do_augment=True,
    )
    if config['n_batches_preview'] > 0:
        debug('preview_generator ...')
        preview_generator(
            train_balanced_generator,
            config,
            filename_prefix=f"train_balanced_generator_{'_'.join([str(x) for x in config['class_ids']])}",
            n_batches=config['n_batches_preview'],
        )
        debug('preview_generator done')

    train_generator = data_gen_from_anno_gen(
        batching_row_gen(randomize_and_loop(train_windowed_anno), config['batch_size']),
        config,
        target_col='corrected_target',
        do_augment=True,
    )
    if config['n_batches_preview'] > 0:
        debug('preview_generator ...')
        preview_generator(
            train_generator,
            config,
            filename_prefix=f"train_generator_{'_'.join([str(x) for x in config['class_ids']])}",
            n_batches=config['n_batches_preview'],
        )
        debug('preview_generator done')

    valid_balanced_generator = data_gen_from_anno_gen(
        gen_even_batches(
            valid_windowed_anno,
            config,
            target_col='corrected_target',
        ),
        config,
        target_col='corrected_target',
        do_augment=False,
    )
    if config['n_batches_preview'] > 0:
        debug('preview_generator ...')
        preview_generator(
            valid_balanced_generator,
            config,
            filename_prefix=f"valid_balanced_generator_{'_'.join([str(x) for x in config['class_ids']])}",
            n_batches=config['n_batches_preview'],
        )
        debug('preview_generator done')

    device = 'cuda'
    log_interval = 1  # progress bar is refreshed every `log_interval` iterations

    train_balanced_generator = numpy_to_pytorch(train_balanced_generator)
    train_balanced_loader = ChunkIter(train_balanced_generator, config['steps_per_epoch'])

    # Only referenced by the disabled second training stage at the end.
    train_generator = numpy_to_pytorch(train_generator)
    train_loader = ChunkIter(train_generator, config['steps_per_epoch'])

    valid_balanced_generator = numpy_to_pytorch(valid_balanced_generator)
    val_loader = ChunkIter(valid_balanced_generator, config['steps_per_epoch_for_valid'])

    def debug_hook(module, input_, output):
        """Log a module's input and output; not registered anywhere in this function."""
        debug(f"input_ = {input_}")
        debug(f"output = {output}")

    # --- model / optimizer, resumed from the best-loss checkpoints if present --
    model = MyModel(config)
    model.to(device)

    path_to_model_checkpoint = cwd_slash('model_best_ravg_loss.pth')
    if os.path.exists(path_to_model_checkpoint):
        debug(f"loading model checkpoint from {path_to_model_checkpoint}")
        model_state_dict = torch.load(path_to_model_checkpoint)
        model.load_state_dict(model_state_dict)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['starting_lr'])

    path_to_optimizer_checkpoint = cwd_slash('optimizer_best_ravg_loss.pth')
    if os.path.exists(path_to_optimizer_checkpoint):
        debug(f"loading optimizer checkpoint from {path_to_optimizer_checkpoint}")
        optimizer_state_dict = torch.load(path_to_optimizer_checkpoint)
        optimizer.load_state_dict(optimizer_state_dict)

    trainer = create_supervised_trainer(model, optimizer, F.binary_cross_entropy, device=device)

    # Running average of the training loss, exposed as metrics['ravg_loss'].
    RunningAverage(alpha=0.99).attach(trainer, 'ravg_loss')

    epoch_timer = Timer().attach(trainer, start=Events.EPOCH_STARTED)

    metrics = {
        'acc': Accuracy(),
        'val_macro_f1': MacroF1(),
        # 'nll': Loss(F.binary_cross_entropy),
    }
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

    pbar = tqdm(initial=0, leave=False, total=config['steps_per_epoch'], mininterval=0.1)

    @trainer.on(Events.STARTED)
    def started_handler(engine):
        """Initialise the bookkeeping fields kept on ``engine.state``."""
        info("started_handler()")
        engine.state.last_ravg_loss = math.inf
        engine.state.best_val_macro_f1 = 0
        engine.state.best_ravg_loss = math.inf
        engine.state.lr = config['starting_lr']
        engine.state.n_restarts = 0
        engine.state.ravg_loss_improved = False
        engine.state.val_macro_f1_improved = False

    def format_log_header(fields):
        """Render the field names as a header row.

        Fields without a ``name`` act as group separators. A negative
        ``width`` left-justifies to ``-width``; a non-negative one
        right-justifies, mirroring ``format_log_record``.
        """
        output_groups = []
        output_group = []
        for field in fields:
            name = field.get('name')
            if name is None:
                output_groups.append(output_group)
                output_group = []
                continue
            display_str = name
            width = field.get('width')
            if type(width) is int:
                # Fixed: the original compared the builtin `int` with 0 and
                # discarded the padded string.
                if width < 0:
                    display_str = display_str.ljust(-width)
                else:
                    display_str = display_str.rjust(width)
            output_group.append(display_str)
        output_groups.append(output_group)
        return ' | '.join([' '.join(g) for g in output_groups])

    @trainer.on(Events.ITERATION_COMPLETED)
    def iteration_completed_handler(engine):
        """Update the progress bar description with the current losses."""
        # 1-based iteration index within the current epoch
        step = (engine.state.iteration - 1) % config['steps_per_epoch'] + 1
        if step % log_interval == 0:
            pbar.set_description_str(
                ' | '.join(
                    [
                        # "class_ids " + str(config['class_ids']).rjust(4),
                        "fold " + str(config['i_fold']),
                        "epoch " + str(engine.state.epoch).rjust(2),
                        "ravg_loss " + f"{trainer.state.metrics['ravg_loss']:.6f}",
                        "loss " + f"{engine.state.output:.4f}",
                    ]
                )
            )
            pbar.update(log_interval)

    max_n_restarts = 10

    @trainer.on(Events.EPOCH_COMPLETED)
    def epoch_completed_handler(engine):
        """Validate, adapt the learning rate, checkpoint and log one epoch."""
        pbar.refresh()
        events = []

        # evaluator.run(train_balanced_loader)
        evaluator.run(val_loader)
        val_metrics = evaluator.state.metrics
        val_macro_f1 = val_metrics['val_macro_f1']['score'].item()
        score_details = val_metrics['val_macro_f1']['details']
        precisions = score_details['precision']
        recalls = score_details['recall']

        # log_record = {
        #     "epoch": engine.state.epoch,
        #     "ravg_loss": engine.state.metrics['ravg_loss'],
        #     "val_avg_acc": val_metrics['acc'],
        #     "val_macro_f1": val_macro_f1,
        #     "epoch_time": epoch_timer.value(),
        #     "engine.state.lr": engine.state.lr,
        # }

        os.makedirs(cwd_slash('macro_f1_details'), exist_ok=True)

        # The running loss must drop by at least 2.5% of last epoch's value to
        # count as an improvement; otherwise the learning rate is cut to 30%.
        if engine.state.last_ravg_loss - engine.state.metrics['ravg_loss'] < 0.025 * engine.state.last_ravg_loss:
            engine.state.ravg_loss_improved = False
            engine.state.lr *= 0.3
            if engine.state.lr < 5e-6:
                # Learning rate exhausted: snapshot the current state, repoint
                # the "latest" symlinks at it and restart from the initial rate.
                model_state_save_path = cwd_slash(f"model_restart_{engine.state.n_restarts}.pth")
                torch.save(model.state_dict(), model_state_save_path)
                debug(f"saved {model_state_save_path}")

                model_softlink_path = cwd_slash("model.pth")
                debug(f"overwriting soft link {model_softlink_path} --> {model_state_save_path}")
                if os.path.islink(model_softlink_path):
                    os.unlink(model_softlink_path)
                # The link targets a regular file, so it is created as a file
                # symlink (the original passed target_is_directory=True).
                os.symlink(
                    os.path.relpath(model_state_save_path, cwd_slash()),
                    model_softlink_path,
                )

                optimizer_state_save_path = cwd_slash(f"optimizer_restart_{engine.state.n_restarts}.pth")
                torch.save(optimizer.state_dict(), optimizer_state_save_path)
                debug(f"saved {optimizer_state_save_path}")

                optimizer_softlink_path = cwd_slash("optimizer.pth")
                debug(f"overwriting soft link {optimizer_softlink_path} --> {optimizer_state_save_path}")
                if os.path.islink(optimizer_softlink_path):
                    os.unlink(optimizer_softlink_path)
                os.symlink(
                    os.path.relpath(optimizer_state_save_path, cwd_slash()),
                    optimizer_softlink_path,
                )

                engine.state.n_restarts += 1
                if engine.state.n_restarts > max_n_restarts:
                    engine.terminate()
                    events.append("max restarts reached")
                else:
                    # NOTE(review): last_ravg_loss is overwritten again at the
                    # end of this handler, so this reset has no lasting effect.
                    engine.state.last_ravg_loss = math.inf
                    engine.state.lr = config['starting_lr']
                    events.append(f"lr reset to {engine.state.lr:.1e}")
            else:
                events.append(f"lr -> {engine.state.lr:.1e}")

            # Push the decayed or reset rate into the optimizer.
            for g in optimizer.param_groups:
                g['lr'] = engine.state.lr
        else:
            engine.state.ravg_loss_improved = True

        if engine.state.metrics['ravg_loss'] < engine.state.best_ravg_loss:
            engine.state.best_ravg_loss = engine.state.metrics['ravg_loss']
            debug(f"saved {cwd_slash('model_best_ravg_loss.pth')}")
            debug(f"saved {cwd_slash('optimizer_best_ravg_loss.pth')}")
            torch.save(model.state_dict(), cwd_slash('model_best_ravg_loss.pth'))
            torch.save(optimizer.state_dict(), cwd_slash('optimizer_best_ravg_loss.pth'))

        if val_macro_f1 > engine.state.best_val_macro_f1:
            engine.state.val_macro_f1_improved = True
            engine.state.best_val_macro_f1 = val_macro_f1
            debug(f"saved {cwd_slash('model_best_val_f1.pth')}")
            debug(f"saved {cwd_slash('optimizer_best_val_f1.pth')}")
            torch.save(model.state_dict(), cwd_slash('model_best_val_f1.pth'))
            torch.save(optimizer.state_dict(), cwd_slash('optimizer_best_val_f1.pth'))
        else:
            engine.state.val_macro_f1_improved = False

        # One entry per displayed column; an empty dict separates column groups.
        log_record = [
            # {
            #     'name': 'class_labels',
            #     'value': class_ids_to_label(config['class_ids'], config),
            #     'width': -32,
            # },
            # {
            #     # ------------------
            # },
            {
                'name': 'fold',
                'value': config['i_fold'],
                'width': 1,
            },
            {
                # ------------------
            },
            {
                'name': 'epoch',
                'value': engine.state.epoch,
                'width': 3,
            },
            {
                # ------------------
            },
            {
                'name': 'ravg_loss',
                'value': engine.state.metrics['ravg_loss'],
                'display': "{:.6f}",
                'width': -9,
                'color': 'yellow' if engine.state.ravg_loss_improved else None,
            },
            {
                'name': 'val_avg_acc',
                'value': val_metrics['acc'],
                'display': "{:.4f}",
                'width': 6,
            },
            {
                # ------------------
            },
            {
                'name': 'val_macro_f1',
                'value': val_macro_f1,
                'display': "{:.6f}",
                'width': -9,
                'color': 'blue' if engine.state.val_macro_f1_improved else None,
            },
            {
                'name': 'precision',
                'value': float(precisions[0]),
                'display': "{:.4f}",
                'width': 6,
            },
            {
                'name': 'recall',
                'value': float(recalls[0]),
                'display': "{:.4f}",
                'width': 6,
            },
            {
                # ------------------
            },
            {
                'name': "epoch_time",
                'value': epoch_timer.value(),
                'display': lambda x: timedelta(seconds=x),
                'width': 15,
            },
            {
                # ------------------
            },
            {
                'name': "lr",
                'value': engine.state.lr,
                'display': "{:.1e}",
                'width': 7,
            },
            {
                # ------------------
            },
            {
                'name': "cache",
                'value': load_img.cache_info(),
                'width': 18,
            },
            {
                # ------------------
            },
            {
                'name': "events",
                'value': '; '.join(events),
            },
        ]

        def format_log_record(fields):
            """Render the field values as one row, groups joined by ' | '."""
            output_groups = []
            output_group = []
            for field in fields:
                name = field.get('name')
                if name is None:
                    output_groups.append(output_group)
                    output_group = []
                    continue
                value = field.get('value')
                display = field.get('display')
                # 'display' is either a format string or a callable.
                if type(display) is str:
                    display_str = display.format(value)
                elif callable(display):
                    display_str = str(display(value))
                else:
                    display_str = str(value)
                width = field.get('width')
                if type(width) is int:
                    if width < 0:
                        display_str = display_str.ljust(-width)
                    else:
                        display_str = display_str.rjust(width)
                color = field.get('color')
                if type(color) is str:
                    display_str = colors.color(display_str, fg=color)
                output_group.append(display_str)
            output_groups.append(output_group)
            return ' | '.join([' '.join(g) for g in output_groups])

        # Machine-readable log: one JSON object per epoch.
        with open(cwd_slash('log.json'), 'a') as f:
            obj = {x['name']: x['value'] for x in log_record if type(x.get('name')) is str}
            json.dump(obj, f)
            f.write('\n')
        # Human-readable log: the same row that is echoed to the console.
        with open(cwd_slash('displayed_log.txt'), 'a') as f:
            f.write(format_log_record(log_record))
            f.write('\n')
        tqdm.write(format_log_record(log_record))
        load_img.reset_cache_info()

        macro_f1_df = format_macro_f1_details(val_metrics['val_macro_f1']['details'], config)
        macro_f1_df.to_csv(cwd_slash('macro_f1_details', f"epoch{engine.state.epoch:03d}_{val_macro_f1}.csv"))
        tqdm.write(repr(macro_f1_df))

        engine.state.last_ravg_loss = engine.state.metrics['ravg_loss']

        # Rewind the progress bar for the next epoch.
        pbar.n = pbar.last_print_n = 0
        pbar.refresh()

    # banner('start training train_balanced_loader')
    trainer.run(train_balanced_loader, max_epochs=config['n_epochs'])

    # trainer.should_terminate = False
    # max_n_restarts = 10
    # banner('start training train_loader')
    # trainer.run(train_loader, max_epochs=config['n_epochs'])

    return {'id_': config['class_ids']}
import os  # used for CUDA_VISIBLE_DEVICES below; harmless if already imported earlier

import numpy as np
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from natsort import natsorted
from Datagen import PngDataGenerator
from Losses import dice_coef_loss
from Models import BlockModel2D
from sklearn.model_selection import train_test_split

# Fixed-seed generator so that random choices are repeatable between runs.
rng = np.random.RandomState(seed=1)

import GPUtil

# Pin the process to one GPU. A DEVICE_ID that already exists in this
# namespace is respected; otherwise GPUtil picks the first available one.
try:
    if 'DEVICE_ID' not in locals():
        DEVICE_ID = GPUtil.getFirstAvailable()[0]
        print('Using GPU', DEVICE_ID)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
except Exception as e:
    # The original `raise ('No GPU available')` tried to raise a str, which is
    # itself a TypeError; raise a real exception and keep the cause attached.
    raise RuntimeError('No GPU available') from e

# Input data locations
train_datapath = '/data/Kaggle/train-png'
train_mask_path = '/data/Kaggle/train-mask'

# Checkpoint filename template; `epoch` and `val_loss` are placeholders to be
# filled in by whoever formats it.
weight_filepath = 'Kaggle_Weights.{epoch:02d}-{val_loss:.4f}.h5'
pretrain_weights_filepath = 'Best_Kaggle_Weights.02-0.61.h5'
# pretrain_weights_filepath = None

# parameters
im_dims = (512, 512)
n_channels = 1
"""By importing this file, a certain configured GPU will be written into an
environment variable that will be read by tensorflow to select a GPU."""
import logging
import os

import preprocessing.config as cfg

if cfg.keras_cfg['set_gpu_device']:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    if cfg.keras_cfg['gpu_auto_set']:
        import GPUtil
        try:
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable()[0])
        except Exception as err:
            # Best effort: keep going with the current environment, but say
            # so. The original bare `except: pass` also swallowed
            # KeyboardInterrupt and SystemExit.
            logging.getLogger(__name__).warning(
                "GPU auto-selection failed, CUDA_VISIBLE_DEVICES left "
                "unchanged: %s", err)
    else:
        # os.environ only accepts strings; str() keeps an integer id usable.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.keras_cfg['gpu_device'])
elif 'CUDA_VISIBLE_DEVICES' in os.environ:
    # GPU pinning disabled in the config: drop any inherited mask.
    del os.environ['CUDA_VISIBLE_DEVICES']
from src import utils
from src.utils import (
    Logger,
    DEFINE_boolean,
    DEFINE_float,
    DEFINE_integer,
    DEFINE_string,
    print_user_flags,
)
from src.cifar10.data_utils import read_data, read_data_corrupt_label
from src.cifar10.general_controller import GeneralController
from src.cifar10.general_child import GeneralChild
from src.cifar10.micro_controller import MicroController
from src.cifar10.micro_child import MicroChild

# Restrict CUDA to the GPU id(s) that GPUtil reports as first available.
deviceIDs = GPUtil.getFirstAvailable()
print('Available GPU: {}'.format(deviceIDs))
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
    str(device_id) for device_id in deviceIDs)

# Command-line flag definitions: (name, default, help text).
flags = tf.app.flags
FLAGS = flags.FLAGS

DEFINE_boolean("reset_output_dir", False, "Delete output_dir if exists.")

DEFINE_string("data_path", "", "")
DEFINE_string("output_dir", "", "")
DEFINE_string("data_format", "NHWC", "'NHWC' or 'NCWH'")
DEFINE_string("search_for", None, "Must be [macro|micro]")

DEFINE_integer("batch_size", 32, "")
DEFINE_integer("num_epochs", 300, "")
def _add_hypothesis_axis(data):
    """Insert a length-1 axis into the claim arrays of ``data`` and add ``h_sizes``."""
    h_sent_sizes = data['h_sent_sizes']
    h_sizes = np.ones(len(h_sent_sizes), np.int32)
    data['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
    data['h_sizes'] = h_sizes
    data['h_tokens'] = np.expand_dims(data['h_tokens'], 1)


def _select_gpu_if_unset():
    """Set CUDA_VISIBLE_DEVICES to the first available GPU when it is unset or blank."""
    if not os.environ.get('CUDA_VISIBLE_DEVICES', '').strip():
        os.environ['CUDA_VISIBLE_DEVICES'] = str(
            GPUtil.getFirstAvailable(
                maxLoad=1.0,
                maxMemory=1.0 - Config.max_gpu_memory)[0])


def main(mode: RTERunPhase, config=None, estimator=None):
    """Train the configured estimator, or predict with it on the test set.

    Args:
        mode: ``RTERunPhase.train`` selects training; any other value runs
            the testing branch.
        config: When given as a string it is passed to
            ``Config.load_config``; other values are ignored.
        estimator: Estimator to reuse. When ``None`` a new one is created
            for training, or loaded via ``load_model`` (falling back to
            ``get_estimator``) for testing.

    Returns:
        The estimator that was trained or used for prediction.
    """
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    is_snopes = getattr(Config, 'is_snopes', False)
    logger.debug("is_snopes: " + str(is_snopes))

    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            # SECURITY NOTE: pickle.load runs arbitrary code from the file;
            # only point training_dump at dumps this project wrote itself.
            with open(Config.training_dump, 'rb') as f:
                dataset_list = pickle.load(f)
        else:
            # process training JSONL file
            training_set, _, _ = embed_data_set_for_elmo(
                Config.training_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_h_sent_size=Config.max_claim_size,
                threshold_b_sent_size=Config.max_sentence_size,
                is_snopes=is_snopes)
            _add_hypothesis_axis(training_set['data'])

            valid_set, _, _ = embed_data_set_for_elmo(
                Config.dev_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            _add_hypothesis_axis(valid_set['data'])

            dataset_list = [training_set, valid_set]
            # save processed training data
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(dataset_list, f,
                                protocol=pickle.HIGHEST_PROTOCOL)

        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        _select_gpu_if_unset()
        estimator.fit(dataset_list[0]['data'], dataset_list[0]['label'],
                      dataset_list[1]['data'], dataset_list[1]['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        # Parameters only need restoring when the caller passed no estimator.
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)

        test_set, _, _ = embed_data_set_for_elmo(
            Config.test_set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        _add_hypothesis_axis(test_set['data'])

        _select_gpu_if_unset()
        logger.debug("CUDA_VISIBLE_DEVICES: " +
                     os.environ['CUDA_VISIBLE_DEVICES'])
        predictions = estimator.predict(
            test_set['data'], restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator