Example #1
    def get_device(self):
        """Gathers an avaliable GPU or CPU for further processing.

        Returns:
            A configuration object containing the device's information.

        """

        # Tries to find an available GPU
        try:
            # Gathers a list of GPUs
            gpus = GPUtil.getGPUs()

            # For each GPU
            for g in gpus:
                # Logs its information
                logging.info(g.name)

            # Calculates the load and memory per process
            load_per_process, mem_per_process = self.get_gpu_config()

            # Calculates the maximum possible load for an available GPU
            max_load = 1 - load_per_process

            # Calculates the maximum possible memory for an available GPU
            max_mem = 1 - mem_per_process

            # Gathers the first available GPU
            device_id = GPUtil.getFirstAvailable(order='first',
                                                 maxLoad=max_load,
                                                 maxMemory=max_mem,
                                                 attempts=3,
                                                 interval=3,
                                                 verbose=False)[0]

            # Checks if the device id exists
            if device_id is not None:
                # Creates a configuration object
                config = {
                    'gpu': {
                        'DEVICE_ID': device_id,
                        'MEMORY_FRACTION': mem_per_process
                    }
                }

                return config

        # If there is no available GPU
        except Exception as e:
            logging.warning(e)

            # Creates a different configuration object
            config = {'cpu': {}}

            return config
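A hedged sketch (not from the source) of how the returned config dict might be consumed; the TF 1.x session setup and the mapping of MEMORY_FRACTION onto per_process_gpu_memory_fraction are assumptions:

import os
import tensorflow as tf  # assumes TF 1.x, as in Example #4 below

config = {'gpu': {'DEVICE_ID': 0, 'MEMORY_FRACTION': 0.25}}  # e.g. from get_device()
if 'gpu' in config:
    # Mask all other GPUs and cap per-process memory usage
    os.environ['CUDA_VISIBLE_DEVICES'] = str(config['gpu']['DEVICE_ID'])
    gpu_opts = tf.GPUOptions(
        per_process_gpu_memory_fraction=config['gpu']['MEMORY_FRACTION'])
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts))
else:
    # CPU fallback: hide all GPUs
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    sess = tf.Session()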
Example #2
        def gpu_conf(cfg, gpu_id=None):

            if gpu_id is None:
                DEVICE_ID_LIST = GPUtil.getFirstAvailable()
                if len(DEVICE_ID_LIST) > 0:
                    cfg.GPU_ID = DEVICE_ID_LIST[0]  # grab first element from list
            else:
                cfg.GPU_ID = gpu_id

            return cfg
Example #3
def pick_device():
    try:
        GPUtil.showUtilization()
        # Get the first available GPU
        DEVICE_ID_LIST = GPUtil.getFirstAvailable()
        DEVICE_ID = DEVICE_ID_LIST[0]  # grab first element from list
        # Set CUDA_VISIBLE_DEVICES to mask out all other GPUs than the first available device id
        os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
        logging.debug('Device ID (unmasked): ' + str(DEVICE_ID))
    except Exception:
        logging.exception('Cannot detect GPUs')
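GPUtil.getFirstAvailable raises a RuntimeError when no GPU satisfies its constraints (Example #8 below catches it explicitly), so the broad except above could be narrowed; a minimal sketch:

import os
import GPUtil

try:
    device_id = GPUtil.getFirstAvailable()[0]
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
except RuntimeError:
    # No qualifying GPU: leave CUDA_VISIBLE_DEVICES unset and run on CPU
    pass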
Example #4
def processing(path, word_index, input_length, x_train):
    """Processing string array with pretrained vectors.

    Converts an n-dimensional string array into an n * k * m float numpy array.
    Each k * m array represents one string: k is the input_length, an upper bound
    on the string length (shorter strings are padded, longer strings are cropped),
    and m is defined by the pretrained file.

    Args:
        path: String, path where the pretrained files are stored.
        word_index: Dictionary, contains words with their tokenized indices.
        input_length: Int, an upper bound of the string length.
        x_train: String array.

    Returns:
        x_train: Numpy array as processed x_train.
    """
    import tensorflow as tf

    embedding_matrix = load_pretrain(path=path, word_index=word_index)

    # Get the first available GPU
    device_id_list = GPUtil.getFirstAvailable()
    device_id = device_id_list[0]  # grab first element from list

    # Set CUDA_VISIBLE_DEVICES to mask out all other GPUs than the first available device id
    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
    device = '/gpu:0'
    with tf.device(device):
        from keras import Input, Model
        from keras import backend
        from keras.layers import Embedding
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        backend.set_session(sess)
        print("generating preprocessing model...")
        embedding_layer = Embedding(len(word_index) + 1,
                                    Constant.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=input_length,
                                    trainable=False)

        sequence_input = Input(shape=(input_length,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        model = Model(sequence_input, embedded_sequences)
        print("converting text to vector...")
        x_train = model.predict(x_train)
        del model

    return x_train
Example #5
    def get_gpufirstavailable(self):
        """
            根据GPU负载以及显存使用量返回第一个可用GPU_id,当无可用GPU时,将报错
            getAvailable参数均可用,含义一致
            attempts: 表示无法获取可用GPU时,尝试重复获取次数
            interval:  表示每次获取可用GPU时,时间间隔(秒)
            verbose:  表示在获取到最佳可用GPU时,是否打印尝试次数
            """
        GPUfirstavailable = GPUtil.getFirstAvailable(order=self.order,
                                                     attempts=self.attempts,
                                                     interval=self.interval,
                                                     verbose=self.verbose)

        return GPUfirstavailable
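The method above reads its parameters from instance attributes that are not shown in the source; a hypothetical holder class, for illustration only:

class GPUPicker:
    """Hypothetical container for the attributes get_gpufirstavailable() expects."""

    def __init__(self, order='first', attempts=3, interval=5, verbose=False):
        self.order = order        # ordering used by GPUtil.getFirstAvailable
        self.attempts = attempts  # retries when no GPU is available
        self.interval = interval  # seconds between retries
        self.verbose = verbose    # print attempt count when a GPU is found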
Example #6
def get_gpu_info():
    """
    :return:
    """
    gpulist = []
    GPUtil.showUtilization()

    # Gather information about each GPU into a list
    gpus = GPUtil.getGPUs()
    for gpu in gpus:
        print('GPU.id:', gpu.id)
        print('GPU total memory:', gpu.memoryTotal)
        print('GPU memory used:', gpu.memoryUsed)
        print('GPU memory utilization (%):', gpu.memoryUtil * 100)
        # Append this GPU's stats
        gpulist.append(
            [gpu.id, gpu.memoryTotal, gpu.memoryUsed, gpu.memoryUtil * 100])
    """    
    根据GPU负载以及显存使用量返回可用GPU_id列表
    first: 返回的gpu可用id按升序排列
    limit: 返回可用GPU的id数量
    maxload: GPU负载率最大限制(超过该值,将不会返回)
    maxMemory:  GPU显存使用率最大限制(超过该值,将不会返回)
    includeNan:  是否包括负载或内存使用为NaN的GPU
    excludeID:  排除的GPU_id列表
    excludeUUID:  类似excludeID,将ID替换成UUID
    """
    GPUavailable = GPUtil.getAvailable(order='first',
                                       limit=1,
                                       maxLoad=0.5,
                                       maxMemory=0.5,
                                       includeNan=False,
                                       excludeID=[],
                                       excludeUUID=[])
    gpulist.append(GPUavailable)
    """
    根据GPU负载以及显存使用量返回第一个可用GPU_id,当无可用GPU时,将报错
    getAvailable参数均可用,含义一致
    attempts: 表示无法获取可用GPU时,尝试重复获取次数
    interval:  表示每次获取可用GPU时,时间间隔(秒)
    verbose:  表示在获取到最佳可用GPU时,是否打印尝试次数
    """
    GPUfirstavailable = GPUtil.getFirstAvailable(order='first',
                                                 attempts=1,
                                                 interval=900,
                                                 verbose=False)

    gpulist.append(GPUfirstavailable)
    return gpulist
Example #7
def WaitForGPU(wait=300):
    GPUavailable = False
    while not GPUavailable:
        try:
            if 'DEVICE_ID' not in locals():
                DEVICE_ID = GPUtil.getFirstAvailable()[0]
                print('Using GPU', DEVICE_ID)
            os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
            GPUavailable = True
            return
        except Exception as e:
            # No GPU available
            print('Waiting for GPU...')
            GPUavailable = False
            time.sleep(wait)
Example #8
    def lauchCorrMutProgram(self, aligFormatedName, ignoreGPU=False):
        nrows, ncols = self.getAligsDims(aligFormatedName)
        memoryRequired = 4 * (4 * (ncols * ncols * 32 * 21 + ncols * 20) +
                              23 * nrows * ncols + nrows +
                              ncols * ncols) + 2 * nrows * ncols + 1024
        tmpResults = os.path.basename(aligFormatedName)
        tmpResults = os.path.join(self.tmp, tmpResults)

        try:
            gpuNumber = GPUtil.getFirstAvailable(order='first',
                                                 maxLoad=0.3,
                                                 maxMemory=0.3,
                                                 attempts=2,
                                                 interval=3)[0]
        except (RuntimeError, OSError, ValueError):
            gpuNumber = None

        wasRunOnGPU = False
        if not ignoreGPU and memoryRequired * 1.1 < self.getTotalGPUMemory(
                gpuNumber):  # *1.1 as a margin of tolerance
            cmdArray = [
                self.ccmPredBin, "-R", "-d",
                str(gpuNumber), aligFormatedName, tmpResults
            ]
            wasRunOnGPU = True
        else:
            cmdArray = [
                self.ccmPredBin, "-R", "-t",
                str(self.corrMutNThrs), aligFormatedName, tmpResults
            ]
        print(" ".join(cmdArray))
        process = Popen(cmdArray, stdout=PIPE, stderr=PIPE)
        processOut = process.communicate()
        try:
            iterOfCorrelatedRows = self.processOutput(processOut, tmpResults)
        except ValueError as e:
            print(e)
            iterOfCorrelatedRows = None

        if iterOfCorrelatedRows is None and wasRunOnGPU:
            print("Error running ccmpred on gpu, trying cpu")
            iterOfCorrelatedRows = self.lauchCorrMutProgram(aligFormatedName,
                                                            ignoreGPU=True)
        return iterOfCorrelatedRows
Example #9
def prepare_environment(resource_limit, log):
    '''Prepares the environment by choosing one GPU to run on,
    adjusts the CUDA_VISIBLE_DEVICES env var for TF, and
    sets the max process length so that the training won't time out.'''
    try:
        DEVICE_ID_LIST = GPUtil.getFirstAvailable(order='last',
                                                  maxLoad=0.85,
                                                  verbose=True)
        DEVICE_ID = DEVICE_ID_LIST[0]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
        log('Preparing environment by choosing a gpu {} and setting resource limit={}'
            .format(DEVICE_ID, resource_limit))
    except Exception:
        print('No GPU found, continuing in CPU mode.')
    try:
        soft, hard = resource.getrlimit(resource.RLIMIT_CPU)
        resource.setrlimit(resource.RLIMIT_CPU, (resource_limit, hard))
    except Exception:
        print('No limit set.')
Example #10
def create_model(g_conv_dim=64, d_conv_dim=64, n_res_blocks=6):
    """
    Builds the generators and discriminators.
    """

    # Instantiate generators
    G_XtoY = CycleGenerator(conv_dim=g_conv_dim, n_res_blocks=n_res_blocks)
    G_YtoX = CycleGenerator(conv_dim=g_conv_dim, n_res_blocks=n_res_blocks)
    # Instantiate discriminators
    D_X = Discriminator(conv_dim=d_conv_dim)
    D_Y = Discriminator(conv_dim=d_conv_dim)
    
    device = torch.device(f"cuda:{GPUtil.getFirstAvailable()[0]}" if torch.cuda.is_available() else "cpu")
    print('device =', device)
    G_XtoY.to(device)
    G_YtoX.to(device)
    D_X.to(device)
    D_Y.to(device)

    return G_XtoY, G_YtoX, D_X, D_Y, device
Example #11
def gpu_setup(gpu_id):
    # Set up GPUs
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = ""
    if gpu_id == "auto":
        try:
            # Try to find an empty GPU automatically
            import GPUtil
            gpu_id = GPUtil.getFirstAvailable(order='memory', maxLoad=0.5,
                                              maxMemory=0.5, attempts=1,
                                              interval=900, verbose=False)
            gpu_id = gpu_id[0]
        except Exception:
            print("can't import GPUtil. maybe you can do: pip install gputil")
            print("gpu id is set to -1")
            gpu_id = -1
    
    gpu_id = int(gpu_id)
    if gpu_id >= 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("gpu id: %s"%gpu_id)
    print("using device: %s"%device)
    return device
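A usage sketch for gpu_setup() above. Once CUDA_VISIBLE_DEVICES is set, the chosen GPU is always visible to PyTorch as cuda:0, which is why the function returns torch.device("cuda:0") regardless of the physical id:

# Assumes the gpu_setup() definition above and `import torch` are in scope.
device = gpu_setup("auto")            # pick the emptiest GPU, or gpu_setup(2) to pin GPU 2
x = torch.zeros(8, 8, device=device)  # tensor is allocated on the selected device (or CPU)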
Example #12
def configure_gpu_tf():
    """ This is an example for how to customise the search for a GPU for a specific job depending on
    hardware/organisational requirements. In this case, we have a machine with two GPUs on which we want to support
    three simultaneous GPU jobs (& unlimited CPU). """

    try:
        # locate available devices & set required environment variables
        available_device_ids = GPUtil.getFirstAvailable(order='first',
                                                        maxLoad=0.7,
                                                        maxMemory=0.7,
                                                        attempts=1,
                                                        interval=10)
        available_device_id = available_device_ids[0]
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        os.environ['CUDA_VISIBLE_DEVICES'] = str(available_device_id)
        print(f"\n GPU Found! running on GPU:{available_device_id}\n")

        # set GPU configuration (use all GPU memory if device 0, else use <50% of memory)
        tf.debugging.set_log_device_placement(False)
        physical_gpu = tf.config.experimental.list_physical_devices('GPU')[0]

        if available_device_id == 0:
            tf.config.experimental.set_memory_growth(physical_gpu, True)
        else:
            tf.config.experimental.set_virtual_device_configuration(
                physical_gpu, [
                    tf.config.experimental.VirtualDeviceConfiguration(
                        memory_limit=4500)
                ])
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            assert len(
                logical_gpus
            ) == 1, "error creating virtual GPU to fractionally use memory"

    # if we can't find a GPU, or they are all busy, default to using CPU
    except RuntimeError:
        print("\n No GPUs available... running on CPU\n")
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
Example #13
def configure_gpu(prefered: Optional[int]) -> None:
    """
    Configure the CUDA GPU (if applicable).
    """
    try:
        import GPUtil  # type: ignore
    except ImportError:
        warnings.warn("Could not import GPUtil: using default GPU.")
    else:
        # Use an equal slice of the machine as the number of physical cores.
        # This is usually the ideal number of processes before resource contention.
        max_resources = 1. - 1. / full_cores_available()
        limits = dict(maxLoad=max_resources, maxMemory=max_resources)
        if prefered is None:
            # Select an available GPU.
            device_id, = GPUtil.getFirstAvailable(
                order='random',
                attempts=2 * 60,  # try for an hour to get a GPU
                interval=30,  # Try every 30 seconds.
                verbose=True,
                **limits)
        else:
            # Get the GPUs with the LEAST memory utilization.
            available = GPUtil.getAvailable(order='memory', limit=16, **limits)
            assert len(available) >= 1
            if prefered in available:
                device_id = prefered
            else:
                logger.warning("Requested GPU %d unavailable", prefered)
                # Since the preference is unavailable, use the GPU with the least
                # allocated memory.
                device_id = available[0]

        # Set the preferred GPU ID.
        logger.info("Using GPU %d", device_id)
        os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
Example #14
# $
# $ pip install bert-serving-server
# $ pip install bert-serving-client
# $

# using BertClient inside tf.data API

import json
import os
import time

import GPUtil
import tensorflow as tf
from bert_serving.client import BertClient

os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUtil.getFirstAvailable()[0])

train_fp = ['/data/cips/data/larry-autoencoder/cail_0518/data_train.json']
batch_size = 256
num_parallel_calls = 4
num_concurrent_clients = 10  # should be greater than `num_parallel_calls`

bc_clients = [
    BertClient(show_server_config=False) for _ in range(num_concurrent_clients)
]


def get_encodes(x):
    # x is `batch_size` of lines, each of which is a json object
    samples = [json.loads(l) for l in x]
    text = [s['fact'][-50:] for s in samples]
Example #15
File: tlearn.py  Project: kipoi/manuscript
        default=-1,
        type=int,
        help='Which gpu to use. If -1, determine automatically')
    args = parser.parse_args()
    dl_kwargs_train = parse_json_file_str(args.dl_kwargs_train)
    dl_kwargs_eval = parse_json_file_str(args.dl_kwargs_eval)
    if args.add_n_hidden == "":
        hidden = []
    else:
        hidden = [int(x) for x in args.add_n_hidden.split(",")]
    # -------
    odir = Path(args.output)
    odir.mkdir(parents=True, exist_ok=True)

    if args.gpu == -1:
        gpu = GPUtil.getFirstAvailable(attempts=3, includeNan=True)[0]
    else:
        gpu = args.gpu
    create_tf_session(gpu)

    # Get the model and the dataloader
    model = kipoi.get_model(args.model, args.source)
    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    if model.type != "keras":
        raise ValueError("Only keras models are supported")
Example #16
from __future__ import print_function
import numpy as np
from six.moves import range
import h5py
import scipy.io as io
import sys, os
import itertools as it
import time
import GPUtil
#from api.resources.preprocessing.DeepVess.TrainDeepVess import train_deep_vess
#from api.resources.preprocessing.DeepVess.DeepVessModel import define_deepvess_architecture
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
deviceID = GPUtil.getFirstAvailable(order='random',
                                    maxLoad=0.7,
                                    maxMemory=0.6,
                                    attempts=10,
                                    interval=100,
                                    verbose=True)
os.environ["CUDA_VISIBLE_DEVICES"] = str(deviceID[0])
import tensorflow as tf


def start_tracing_model(inputData,
                        isTrain=False,
                        isForward=True,
                        padSize=((3, 3), (16, 16), (16, 16), (0, 0))):
    """

    :param inputData:
    :param isTrain: Change isTrain to True if you want to train the network
    :param isForward:  Change isForward to True if you want to test the network
Example #17
File: ppo_gym.py  Project: gp1702/temp
import GPUtil
import datetime
import os
DEVICE_ID_LIST = GPUtil.getFirstAvailable()
DEVICE_ID = DEVICE_ID_LIST[0]
os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
import argparse
import gym
import sys
import pickle
import time
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from utils import *
from models.mlp_policy import Policy
from models.mlp_critic import Value
from models.mlp_policy_disc import DiscretePolicy
from torch.autograd import Variable
from core.ppo import ppo_step
from core.common import estimate_advantages
from core.agent import Agent

Tensor = DoubleTensor
torch.set_default_tensor_type('torch.DoubleTensor')

parser = argparse.ArgumentParser(description='PyTorch PPO example')
parser.add_argument('--env-name',
                    default="Reacher-v1",
                    metavar='G',
                    help='name of the environment to run')
Example #18
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'use_inter_evidence_comparison'):
        use_inter_evidence_comparison = Config.use_inter_evidence_comparison
    else:
        use_inter_evidence_comparison = False
    # 'esim_inter_evidence' model and 'esim_inter_evidence_claim_evidences_comparison' models need inter evidence inputs
    use_inter_evidence_comparison = use_inter_evidence_comparison or Config.estimator_name in {
        'esim_inter_evidence', 'esim_inter_evidence_claim_evidences_comparison'
    }
    if hasattr(Config, 'use_claim_evidences_comparison'):
        use_claim_evidences_comparison = Config.use_claim_evidences_comparison
    else:
        use_claim_evidences_comparison = False
    # 'esim_inter_evidence_claim_evidences_comparison' model needs claim-evidence inputs
    use_claim_evidences_comparison = use_claim_evidences_comparison or Config.estimator_name in {
        'esim_inter_evidence_claim_evidences_comparison'
    }
    if hasattr(Config, 'use_extra_features'):
        use_extra_features = Config.use_extra_features
    else:
        use_extra_features = False
    if hasattr(Config, 'use_numeric_feature'):
        use_numeric_feature = Config.use_numeric_feature
    else:
        use_numeric_feature = False
    # 'esim_num_feature' model needs numeric feature inputs
    use_numeric_feature = use_numeric_feature or Config.estimator_name in {
        'esim_num_feature'
    }
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_sentence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)

            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            if use_extra_features:
                assert hasattr(
                    Config, 'feature_path'
                ), "Config should has feature_path if Config.use_feature is True"
                training_claim_features, training_evidence_features = load_feature_by_data_set(
                    Config.training_set_file, Config.feature_path,
                    Config.max_sentences)
                valid_claim_features, valid_evidence_features = load_feature_by_data_set(
                    Config.dev_set_file, Config.feature_path,
                    Config.max_sentences)
                training_set['data']['h_feats'] = training_claim_features
                training_set['data']['b_feats'] = training_evidence_features
                valid_set['data']['h_feats'] = valid_claim_features
                valid_set['data']['b_feats'] = valid_evidence_features
            if use_numeric_feature:
                training_num_feat = number_feature(Config.training_set_file,
                                                   Config.db_path,
                                                   Config.max_sentences,
                                                   is_snopes)
                valid_num_feat = number_feature(Config.dev_set_file,
                                                Config.db_path,
                                                Config.max_sentences,
                                                is_snopes)
                training_set['data']['num_feat'] = training_num_feat
                valid_set['data']['num_feat'] = valid_num_feat
            if use_inter_evidence_comparison:
                training_concat_sent_indices, training_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices'] = training_concat_sent_indices
                training_set['data'][
                    'b_concat_sizes'] = training_concat_sent_sizes
                valid_concat_sent_indices, valid_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices'] = valid_concat_sent_indices
                valid_set['data']['b_concat_sizes'] = valid_concat_sent_sizes
            if use_claim_evidences_comparison:
                training_all_evidences_indices, training_all_evidences_sizes = generate_concat_indices_for_claim(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices_for_h'] = training_all_evidences_indices
                training_set['data'][
                    'b_concat_sizes_for_h'] = training_all_evidences_sizes
                valid_all_evidences_indices, valid_all_evidences_sizes = generate_concat_indices_for_claim(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices_for_h'] = valid_all_evidences_indices
                valid_set['data'][
                    'b_concat_sizes_for_h'] = valid_all_evidences_sizes
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        if use_extra_features:
            assert hasattr(
                Config, 'feature_path'
            ), "Config should has feature_path if Config.use_feature is True"
            test_claim_features, test_evidence_features = load_feature_by_data_set(
                Config.test_set_file, Config.feature_path,
                Config.max_sentences)
            test_set['data']['h_feats'] = test_claim_features
            test_set['data']['b_feats'] = test_evidence_features
        if use_numeric_feature:
            test_num_feat = number_feature(Config.test_set_file,
                                           Config.db_path,
                                           Config.max_sentences, is_snopes)
            test_set['data']['num_feat'] = test_num_feat
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if use_inter_evidence_comparison:
            test_concat_sent_indices, test_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data']['b_concat_indices'] = test_concat_sent_indices
            test_set['data']['b_concat_sizes'] = test_concat_sent_sizes
        if use_claim_evidences_comparison:
            test_all_evidences_indices, test_all_evidences_sizes = generate_concat_indices_for_claim(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data'][
                'b_concat_indices_for_h'] = test_all_evidences_indices
            test_set['data']['b_concat_sizes_for_h'] = test_all_evidences_sizes
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Example #19
                # optimize every params that require grad
                if optimizer_name == 'Adam':
                    optimizer = torch.optim.Adam(filter(
                        lambda p: p.requires_grad, model.parameters()),
                                                 lr=opt_lr)

                # attempt to send the model to a GPU, else train on the CPU
                model_sent_to_device = False
                sleep_time = 30
                while not model_sent_to_device and sleep_time < 4800:
                    # get free device
                    device = torch.device('cuda')
                    try:
                        device_id = GPUtil.getFirstAvailable(order='memory',
                                                             maxLoad=1.0,
                                                             maxMemory=0.8,
                                                             verbose=False)[0]
                        # send to least used GPU
                        print('Using GPU:', device_id)
                        with torch.cuda.device(device_id):
                            model.to(device)
                            model_sent_to_device = True

                    except Exception as e:
                        print(e)
                        sleep_time = 1.66 * sleep_time
                        print('GPU error. Wait {}s and continue'.format(
                            sleep_time))
                        time.sleep(sleep_time)

                if not model_sent_to_device:
Example #20
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid,
                 Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(
                Config.training_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(
                Config.dev_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            b_train = X_train['b']
            X_train['b_sizes'] = get_num_sents_of_bodies(b_train)
            for i, sample in enumerate(b_train):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_train[i] = np.asarray(sample)
            b_train = np.asarray(b_train)
            X_train['b'] = b_train
            logger.debug("b_train.shape: " + str(b_train.shape))
            b_valid = X_valid['b']
            X_valid['b_sizes'] = get_num_sents_of_bodies(b_valid)
            for i, sample in enumerate(b_valid):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_valid[i] = np.asarray(sample)
            b_valid = np.asarray(b_valid)
            X_valid['b'] = b_valid
            logger.debug("b_valid.shape: " + str(b_valid.shape))
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(
            Config.test_set_file,
            Config.db_path,
            num_sentences=Config.max_sentences,
            is_snopes=is_snopes)
        b_test = X_test['b']
        X_test['b_sizes'] = get_num_sents_of_bodies(b_test)
        for i, sample in enumerate(b_test):
            if len(sample) < Config.max_sentences:
                for _ in range(Config.max_sentences - len(sample)):
                    sample.append(" ")
            b_test[i] = np.asarray(sample)
        b_test = np.asarray(b_test)
        X_test['b'] = b_test
        logger.debug("b_test.shape: " + str(b_test.shape))
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file,
                            Config.submission_file)
        if Y_labels_test:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
Example #21
import torch
import os
import GPUtil
from data_load import data_load as DL
import numpy as np
devices = "%d" % GPUtil.getFirstAvailable(order="memory")[0]
os.environ["CUDA_VISIBLE_DEVICES"] = str(devices)

# torch.cuda.manual_seed(1234)
# l = torch.cuda.get_rng_state()
# torch.bernoulli(torch.full((5,5), 0.5, device='cuda'))
#
# torch.cuda.manual_seed(1234)
# l2 = torch.cuda.get_rng_state()
#
# (l==l2).all().item()

np.random.seed(1234)
torch.manual_seed(1234)
torch.cuda.manual_seed(1234)
torch.cuda.manual_seed_all(1234)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

n_sample = 10
n_dim = 3
data = np.random.randint((50), size=(n_sample, n_dim))
label = np.random.randint((3), size=(n_sample,))
train_loader = DL.convert_Dloader(5, data, label, is_training=True, num_workers=0, shuffle=True)

for epoch in range(3):
Example #22
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    logger.info("this script is only for FEVER dataset")
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)
            training_set['data']['scores'] = load_scores(
                Config.training_set_file, Config.max_sentences)

            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            valid_set['data']['scores'] = load_scores(Config.dev_set_file,
                                                      Config.max_sentences)

            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['scores'] = load_scores(Config.test_set_file,
                                                 Config.max_sentences)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Example #23
def bestGPU():
    try:
        return GPUtil.getFirstAvailable()
    except RuntimeError:
        return [random.randint(0, 7)]
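A hedged usage sketch; note that the random fallback above blindly picks one of eight GPU ids, which assumes an 8-GPU machine:

# Assumes the bestGPU() definition above is in scope; it returns a list of ids.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = str(bestGPU()[0])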
Example #24
''' Script that selects first available GPU'''
import os
import GPUtil
devicesid = GPUtil.getFirstAvailable()
os.environ["CUDA_VISIBLE_DEVICES"] = str(devicesid[0])
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "1"
print('GPU %d was selected' % devicesid[0])
Example #25
import GPUtil as GPU

# Get 1 available GPU, ordered by id descending
print(GPU.getAvailable(order='last', limit=1))

# Get 1 random available GPU
print('Random available: '),
print(GPU.getAvailable(order='random'))

# Get 1 available GPU, ordered by GPU load ascending
print('First available weighted by GPU load ascending: '),
print(GPU.getAvailable(order='load', limit=1))

# Get all available GPU with max load of 10%, ordered by memory ascending
print('All available weighted by memory load ascending: '),
print(GPU.getAvailable(order='memory', limit=999, maxLoad=0.1))

# Get the first available GPU
firstGPU = GPU.getFirstAvailable()
print('First available GPU id:'),
print(firstGPU)

# Get the first available GPU, where memory usage is less than 90% and processing is less than 80%
firstGPU = GPU.getFirstAvailable(maxMemory=0.9, maxLoad=0.8)
print('First available GPU id (memory < 90%, load < 80%):'),
print(firstGPU)

# Get the first available GPU, where processing is less than 1%
firstGPU = GPU.getFirstAvailable(attempts=5,
                                 interval=5,
                                 maxLoad=0.01,
                                 verbose=True)
print('First available GPU id (load < 1%):'),
print(firstGPU)
Example #26
def train(config):
    cwd_slash = gen_cwd_slash(config)

    os.makedirs(config['_cwd'], exist_ok=True)

    if config['cuda_visible_devices'] is not None:
        debug(f"Using GPU: {config['cuda_visible_devices']}")
        os.environ['CUDA_VISIBLE_DEVICES'] = config['cuda_visible_devices']
    else:
        avail_gpu = str(GPUtil.getFirstAvailable()[0])
        debug(f"Selecting the first available GPU: {avail_gpu}")
        os.environ['CUDA_VISIBLE_DEVICES'] = avail_gpu

    train_windowed_anno = pd.read_csv(config['path_to_train_windowed_anno_cache'], index_col=0)
    valid_windowed_anno = pd.read_csv(config['path_to_valid_windowed_anno_cache'], index_col=0)

    train_balanced_generator = data_gen_from_anno_gen(
        gen_even_batches(
            train_windowed_anno,
            config,
            target_col='corrected_target',
        ),
        config,
        target_col='corrected_target',
        do_augment=True,
    )

    if config['n_batches_preview'] > 0:
        debug('preview_generator ...')
        preview_generator(
            train_balanced_generator,
            config,
            filename_prefix=f"train_balanced_generator_{'_'.join([str(x) for x in config['class_ids']])}",
            n_batches=config['n_batches_preview'],
        )
        debug('preview_generator done')

    train_generator = data_gen_from_anno_gen(
        batching_row_gen(randomize_and_loop(train_windowed_anno), config['batch_size']),
        config,
        target_col='corrected_target',
        do_augment=True,
    )

    if config['n_batches_preview'] > 0:
        debug('preview_generator ...')
        preview_generator(
            train_generator,
            config,
            filename_prefix=f"train_generator_{'_'.join([str(x) for x in config['class_ids']])}",
            n_batches=config['n_batches_preview'],
        )
        debug('preview_generator done')

    valid_balanced_generator = data_gen_from_anno_gen(
        gen_even_batches(
            valid_windowed_anno,
            config,
            target_col='corrected_target',
        ),
        config,
        target_col='corrected_target',
        do_augment=False,
    )

    if config['n_batches_preview'] > 0:
        debug('preview_generator ...')
        preview_generator(
            valid_balanced_generator,
            config,
            filename_prefix=f"valid_balanced_generator_{'_'.join([str(x) for x in config['class_ids']])}",
            n_batches=config['n_batches_preview'],
        )
        debug('preview_generator done')

    device = 'cuda'
    log_interval = 1

    train_balanced_generator = numpy_to_pytorch(train_balanced_generator)
    train_balanced_loader = ChunkIter(train_balanced_generator, config['steps_per_epoch'])
    train_generator = numpy_to_pytorch(train_generator)
    train_loader = ChunkIter(train_generator, config['steps_per_epoch'])
    valid_balanced_generator = numpy_to_pytorch(valid_balanced_generator)
    val_loader = ChunkIter(valid_balanced_generator, config['steps_per_epoch_for_valid'])

    def debug_hook(module, input_, output):
        debug(f"input_ = {input_}")
        debug(f"output = {output}")

    model = MyModel(config)
    model.to(device)

    path_to_model_checkpoint = cwd_slash('model_best_ravg_loss.pth')
    if os.path.exists(path_to_model_checkpoint):
        debug(f"loading model checkpoint from {path_to_model_checkpoint}")
        model_state_dict = torch.load(path_to_model_checkpoint)
        model.load_state_dict(model_state_dict)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['starting_lr'])

    path_to_optimizer_checkpoint = cwd_slash('optimizer_best_ravg_loss.pth')
    if os.path.exists(path_to_optimizer_checkpoint):
        debug(f"loading optimizer checkpoint from {path_to_optimizer_checkpoint}")
        optimizer_state_dict = torch.load(path_to_optimizer_checkpoint)
        optimizer.load_state_dict(optimizer_state_dict)

    trainer = create_supervised_trainer(model, optimizer, F.binary_cross_entropy, device=device)
    RunningAverage(alpha=0.99).attach(trainer, 'ravg_loss')
    epoch_timer = Timer().attach(trainer, start=Events.EPOCH_STARTED)

    metrics = {
        'acc': Accuracy(),
        'val_macro_f1': MacroF1(),
        # 'nll': Loss(F.binary_cross_entropy),
    }
    evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)

    pbar = tqdm(initial=0, leave=False, total=config['steps_per_epoch'], mininterval=0.1)

    @trainer.on(Events.STARTED)
    def started_handler(engine):
        info("started_handler()")
        engine.state.last_ravg_loss = math.inf
        engine.state.best_val_macro_f1 = 0
        engine.state.best_ravg_loss = math.inf
        engine.state.lr = config['starting_lr']
        engine.state.n_restarts = 0
        engine.state.ravg_loss_improved = False
        engine.state.val_macro_f1_improved = False

        def format_log_header(fields):
            output_groups = []
            output_group = []
            for field in fields:
                name = field.get('name')
                if name is None:
                    output_groups.append(output_group)
                    output_group = []
                    continue

                display_str = name

                width = field.get('width')
                if type(width) is int:
                    if width < 0:
                        display_str = display_str.ljust(-width)
                    else:
                        display_str = display_str.rjust(width)

                output_group.append(display_str)

            output_groups.append(output_group)

            return ' | '.join([' '.join(g) for g in output_groups])

    @trainer.on(Events.ITERATION_COMPLETED)
    def iteration_completed_handler(engine):
        iter = (engine.state.iteration - 1) % config['steps_per_epoch'] + 1

        if iter % log_interval == 0:
            pbar.set_description_str(
                ' | '.join(
                    [
                        # "class_ids " + str(config['class_ids']).rjust(4),
                        "fold " + str(config['i_fold']),
                        "epoch " + str(engine.state.epoch).rjust(2),
                        "ravg_loss " + f"{trainer.state.metrics['ravg_loss']:.6f}",
                        "loss " + f"{engine.state.output:.4f}",
                    ]
                )
            )
            pbar.update(log_interval)

    max_n_restarts = 10

    @trainer.on(Events.EPOCH_COMPLETED)
    def epoch_completed_handler(engine):
        pbar.refresh()
        events = []
        # evaluator.run(train_balanced_loader)
        evaluator.run(val_loader)
        val_metrics = evaluator.state.metrics
        val_macro_f1 = val_metrics['val_macro_f1']['score'].item()
        score_details = val_metrics['val_macro_f1']['details']
        precisions = score_details['precision']
        recalls = score_details['recall']
        # log_record = {
        #     "epoch": engine.state.epoch,
        #     "ravg_loss": engine.state.metrics['ravg_loss'],
        #     "val_avg_acc": val_metrics['acc'],
        #     "val_macro_f1": val_macro_f1,
        #     "epoch_time": epoch_timer.value(),
        #     "engine.state.lr": engine.state.lr,
        # }
        os.makedirs(cwd_slash('macro_f1_details'), exist_ok=True)

        if engine.state.last_ravg_loss - engine.state.metrics['ravg_loss'] < 0.025 * engine.state.last_ravg_loss:
            engine.state.ravg_loss_improved = False
            engine.state.lr *= 0.3
            if engine.state.lr < 5e-6:
                model_state_save_path = cwd_slash(f"model_restart_{engine.state.n_restarts}.pth")
                torch.save(model.state_dict(), model_state_save_path)
                debug(f"saved {model_state_save_path}")

                model_softlink_path = cwd_slash(f"model.pth")
                debug(f"overwriting soft link {model_softlink_path}  -->  {model_state_save_path}")
                if os.path.islink(model_softlink_path):
                    os.unlink(model_softlink_path)
                os.symlink(
                    os.path.relpath(model_state_save_path, cwd_slash()),
                    model_softlink_path,
                    target_is_directory=True,
                )

                optimizer_state_save_path = cwd_slash(f"optimizer_restart_{engine.state.n_restarts}.pth")
                torch.save(optimizer.state_dict(), optimizer_state_save_path)
                debug(f"saved {optimizer_state_save_path}")

                optimizer_softlink_path = cwd_slash(f"optimizer.pth")
                debug(f"overwriting soft link {optimizer_softlink_path}  -->  {optimizer_state_save_path}")
                if os.path.islink(optimizer_softlink_path):
                    os.unlink(optimizer_softlink_path)
                os.symlink(
                    os.path.relpath(optimizer_state_save_path, cwd_slash()),
                    optimizer_softlink_path,
                    target_is_directory=True,
                )

                engine.state.n_restarts += 1
                if engine.state.n_restarts > max_n_restarts:
                    engine.terminate()
                    events.append(f"max restarts reached")
                else:
                    engine.state.last_ravg_loss = math.inf
                    engine.state.lr = config['starting_lr']
                    events.append(f"lr reset to {engine.state.lr:.1e}")
            else:
                events.append(f"lr -> {engine.state.lr:.1e}")
            for g in optimizer.param_groups:
                g['lr'] = engine.state.lr
        else:
            engine.state.ravg_loss_improved = True

        if engine.state.metrics['ravg_loss'] < engine.state.best_ravg_loss:
            engine.state.best_ravg_loss = engine.state.metrics['ravg_loss']
            debug(f"saved {cwd_slash('model_best_ravg_loss.pth')}")
            debug(f"saved {cwd_slash('optimizer_best_ravg_loss.pth')}")
            torch.save(model.state_dict(), cwd_slash('model_best_ravg_loss.pth'))
            torch.save(optimizer.state_dict(), cwd_slash('optimizer_best_ravg_loss.pth'))

        if val_macro_f1 > engine.state.best_val_macro_f1:
            engine.state.val_macro_f1_improved = True
            engine.state.best_val_macro_f1 = val_macro_f1
            debug(f"saved {cwd_slash('model_best_val_f1.pth')}")
            debug(f"saved {cwd_slash('optimizer_best_val_f1.pth')}")
            torch.save(model.state_dict(), cwd_slash('model_best_val_f1.pth'))
            torch.save(optimizer.state_dict(), cwd_slash('optimizer_best_val_f1.pth'))
        else:
            engine.state.val_macro_f1_improved = False

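        # Each entry below describes one column of the log line: 'name' and
        # 'value' are required; 'display' is an optional format string or
        # callable; 'width' pads the cell (negative widths left-justify);
        # 'color' highlights it; an empty dict inserts a ' | ' separator.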
        log_record = [
            # {
            #     'name': 'class_labels',
            #     'value': class_ids_to_label(config['class_ids'], config),
            #     'width': -32,
            # },
            # {
            #     # ------------------
            # },
            {
                'name': 'fold',
                'value': config['i_fold'],
                'width': 1,
            },
            {
                # ------------------
            },
            {
                'name': 'epoch',
                'value': engine.state.epoch,
                'width': 3,
            },
            {
                # ------------------
            },
            {
                'name': 'ravg_loss',
                'value': engine.state.metrics['ravg_loss'],
                'display': "{:.6f}",
                'width': -9,
                'color': 'yellow' if engine.state.ravg_loss_improved else None,
            },
            {
                'name': 'val_avg_acc',
                'value': val_metrics['acc'],
                'display': "{:.4f}",
                'width': 6,
            },
            {
                # ------------------
            },
            {
                'name': 'val_macro_f1',
                'value': val_macro_f1,
                'display': "{:.6f}",
                'width': -9,
                'color': 'blue' if engine.state.val_macro_f1_improved else None,
            },
            {
                'name': 'precision',
                'value': float(precisions[0]),
                'display': "{:.4f}",
                'width': 6,
            },
            {
                'name': 'recall',
                'value': float(recalls[0]),
                'display': "{:.4f}",
                'width': 6,
            },
            {
                # ------------------
            },
            {
                'name': "epoch_time",
                'value': epoch_timer.value(),
                'display': lambda x: timedelta(seconds=x),
                'width': 15,
            },
            {
                # ------------------
            },
            {
                'name': "lr",
                'value': engine.state.lr,
                'display': "{:.1e}",
                'width': 7,
            },
            {
                # ------------------
            },
            {
                'name': "cache",
                'value': load_img.cache_info(),
                'width': 18,
            },
            {
                # ------------------
            },
            {
                'name': "events",
                'value': '; '.join(events),
            },
        ]

        def format_log_record(fields):
            output_groups = []
            output_group = []
            for field in fields:
                name = field.get('name')
                if name is None:
                    output_groups.append(output_group)
                    output_group = []
                    continue

                value = field.get('value')
                display = field.get('display')
                if type(display) is str:
                    display_str = display.format(value)
                elif callable(display):
                    display_str = str(display(value))
                else:
                    display_str = str(value)

                width = field.get('width')
                if type(width) is int:
                    if width < 0:
                        display_str = display_str.ljust(-width)
                    else:
                        display_str = display_str.rjust(width)

                color = field.get('color')
                if type(color) is str:
                    display_str = colors.color(display_str, fg=color)

                output_group.append(display_str)

            output_groups.append(output_group)

            return ' | '.join([' '.join(g) for g in output_groups])

        with open(cwd_slash('log.json'), 'a') as f:
            obj = {x['name']: x['value'] for x in log_record if type(x.get('name')) is str}
            json.dump(obj, f)
            f.write('\n')

        with open(cwd_slash('displayed_log.txt'), 'a') as f:
            f.write(format_log_record(log_record))
            f.write('\n')

        tqdm.write(format_log_record(log_record))

        load_img.reset_cache_info()
        macro_f1_df = format_macro_f1_details(val_metrics['val_macro_f1']['details'], config)
        macro_f1_df.to_csv(cwd_slash('macro_f1_details', f"epoch{engine.state.epoch:03d}_{val_macro_f1:.6f}.csv"))
        tqdm.write(repr(macro_f1_df))

        engine.state.last_ravg_loss = engine.state.metrics['ravg_loss']
        pbar.n = pbar.last_print_n = 0
        pbar.refresh()

    # banner('start training train_balanced_loader')
    trainer.run(train_balanced_loader, max_epochs=config['n_epochs'])
    # trainer.should_terminate = False
    # max_n_restarts = 10
    # banner('start training train_loader')
    # trainer.run(train_loader, max_epochs=config['n_epochs'])

    return {'id_': config['class_ids']}

Example #27

import os

import numpy as np
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from natsort import natsorted

from Datagen import PngDataGenerator
from Losses import dice_coef_loss
from Models import BlockModel2D
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(seed=1)

import GPUtil
try:
    if 'DEVICE_ID' not in locals():
        DEVICE_ID = GPUtil.getFirstAvailable()[0]
        print('Using GPU', DEVICE_ID)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(DEVICE_ID)
except Exception as e:
    # raising a plain string is itself a TypeError; raise a real exception
    raise RuntimeError('No GPU available') from e

train_datapath = '/data/Kaggle/train-png'
train_mask_path = '/data/Kaggle/train-mask'
weight_filepath = 'Kaggle_Weights.{epoch:02d}-{val_loss:.4f}.h5'

pretrain_weights_filepath = 'Best_Kaggle_Weights.02-0.61.h5'
# pretrain_weights_filepath = None

# parameters
im_dims = (512, 512)
n_channels = 1
Example #28
"""By Importing this file, a certain configured GPU will be written into an
environment variable that will be read by tensorflow to select a GPU."""

import preprocessing.config as cfg
import os

if cfg.keras_cfg['set_gpu_device']:
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    if cfg.keras_cfg['gpu_auto_set']:
        import GPUtil
        try:
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable()[0])
        except Exception:
            # no free GPU found; fall back to TensorFlow's default selection
            pass
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = cfg.keras_cfg['gpu_device']
elif 'CUDA_VISIBLE_DEVICES' in os.environ:
    del os.environ['CUDA_VISIBLE_DEVICES']
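
A minimal usage sketch of the module above, assuming it is saved as gpu_selector.py (the file name and the TF 2.x check are assumptions, not part of the original snippet):

# Import for its side effect only: CUDA_VISIBLE_DEVICES must be set before
# TensorFlow creates its CUDA context, so this import has to come first.
import gpu_selector  # noqa: F401

import tensorflow as tf

# With TF 2.x, this lists only the device selected above (empty on CPU fallback).
print(tf.config.list_physical_devices('GPU'))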
Example #29
import os

import GPUtil
import tensorflow as tf

from src import utils
from src.utils import Logger
from src.utils import DEFINE_boolean
from src.utils import DEFINE_float
from src.utils import DEFINE_integer
from src.utils import DEFINE_string
from src.utils import print_user_flags

from src.cifar10.data_utils import read_data, read_data_corrupt_label
from src.cifar10.general_controller import GeneralController
from src.cifar10.general_child import GeneralChild

from src.cifar10.micro_controller import MicroController
from src.cifar10.micro_child import MicroChild

deviceIDs = GPUtil.getFirstAvailable()
print('Available GPU: {}'.format(deviceIDs))
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, deviceIDs))

flags = tf.app.flags
FLAGS = flags.FLAGS

DEFINE_boolean("reset_output_dir", False, "Delete output_dir if exists.")
DEFINE_string("data_path", "", "")
DEFINE_string("output_dir", "", "")
DEFINE_string("data_format", "NHWC", "'NHWC' or 'NCWH'")
DEFINE_string("search_for", None, "Must be [macro|micro]")

DEFINE_integer("batch_size", 32, "")

DEFINE_integer("num_epochs", 300, "")

Example #30

import os
import pickle

import GPUtil
import numpy as np

# Project-specific helpers (RTERunPhase, LogHelper, Config,
# embed_data_set_for_elmo, get_estimator, save_model, load_model,
# generate_submission, print_metrics) are assumed to come from the
# surrounding codebase.


def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                dataset_list = pickle.load(f)
        else:
            # process training JSONL file
            training_set, _, _ = embed_data_set_for_elmo(
                Config.training_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_h_sent_size=Config.max_claim_size,
                threshold_b_sent_size=Config.max_sentence_size,
                is_snopes=is_snopes)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_tokens'] = np.expand_dims(
                training_set['data']['h_tokens'], 1)
            # training_set['data']['h_ft_np'] = np.expand_dims(training_set['data']['h_ft_np'], 1)

            valid_set, _, _ = embed_data_set_for_elmo(
                Config.dev_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_tokens'] = np.expand_dims(
                valid_set['data']['h_tokens'], 1)

            dataset_list = [training_set, valid_set]
            # save processed training data
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(dataset_list,
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(dataset_list[0]['data'], dataset_list[0]['label'],
                      dataset_list[1]['data'], dataset_list[1]['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        test_set, _, _ = embed_data_set_for_elmo(
            Config.test_set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_tokens'] = np.expand_dims(
            test_set['data']['h_tokens'], 1)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        logger.debug("CUDA_VISIBLE_DEVICES: " +
                     os.environ['CUDA_VISIBLE_DEVICES'])
        predictions = estimator.predict(
            test_set['data'], restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
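
The GPU-selection guard appears verbatim in both branches of main() above. A small helper would remove the duplication; this is a sketch, and the name ensure_visible_gpu is an assumption:

import os

import GPUtil


def ensure_visible_gpu(max_gpu_memory):
    """Pick a GPU only when CUDA_VISIBLE_DEVICES is unset or blank,
    mirroring the guard used in both branches of main() above."""
    if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
            os.environ['CUDA_VISIBLE_DEVICES']).strip():
        os.environ['CUDA_VISIBLE_DEVICES'] = str(
            GPUtil.getFirstAvailable(maxLoad=1.0,
                                     maxMemory=1.0 - max_gpu_memory)[0])

Both call sites would then reduce to ensure_visible_gpu(Config.max_gpu_memory).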