def embed(self, vocabulary, dtype='float32', token_not_found='ignore'):
  """Any word not found in the vocabulary will be set to all-zeros"""
  # ====== check vocab ====== #
  if not isinstance(vocabulary, Mapping):
    raise ValueError('"vocabulary" must be an instance of dict.')
  # ====== check token_not_found ====== #
  if not is_number(token_not_found) and \
  not is_string(token_not_found) and \
  token_not_found not in ('ignore', 'raise'):
    raise ValueError('token_not_found can be: "ignore", "raise", '
                     'an integer token index, or a string '
                     'representing a token.')
  if is_number(token_not_found):
    token_not_found = int(token_not_found)
  elif token_not_found not in ('ignore', 'raise'):
    token_not_found = int(self.dictionary[token_not_found])
  # ====== create embedding matrix ====== #
  ndim = len(next(iter(vocabulary.values())))
  matrix = np.zeros(shape=(len(self.dictionary), ndim), dtype=dtype)
  for word, idx in self.dictionary.items():
    if len(word) == 0:
      continue
    if word in vocabulary:
      matrix[idx, :] = vocabulary[word]
    elif token_not_found == 'raise':
      raise Exception('Cannot find token "%s" in the vocabulary.' % word)
    elif isinstance(token_not_found, int):
      matrix[idx, :] = matrix[token_not_found, :]
  return matrix
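# Standalone numpy sketch of what `embed` produces: one row per entry of the
# tokenizer dictionary, filled from the pre-trained vectors when available and
# left all-zero otherwise. The tiny dictionary/vectors below are made up for
# illustration only.
import numpy as np

dictionary = {'hello': 0, 'world': 1, 'unseen': 2}                 # token -> index
vocabulary = {'hello': [0.1, 0.2, 0.3], 'world': [0.4, 0.5, 0.6]}  # token -> vector
ndim = len(next(iter(vocabulary.values())))
matrix = np.zeros((len(dictionary), ndim), dtype='float32')
for word, idx in dictionary.items():
  if word in vocabulary:
    matrix[idx, :] = vocabulary[word]
# matrix[2] stays all-zero because 'unseen' is not in `vocabulary`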
def _check_shape(s):
  if hasattr(s, '__call__'):
    return s
  if is_number(s) or s is None:
    s = (s,)
  elif isinstance(s, np.ndarray):
    s = s.tolist()
  return tuple([int(i) if is_number(i) else None for i in s])
def _preprocessing_losses(losses, y_true, y_pred, inherit_losses=None,
                          sample_weights=None):
  """ Can be used for both objectives and metrics """
  from odin import backend as K
  # ====== special case: single input/output but multiple losses ====== #
  nb_losses = len(losses)
  if len(y_true) == 0:
    y_true = [None] * nb_losses
  elif len(y_true) == 1:
    y_true = y_true * nb_losses
  if len(y_pred) == 0:
    y_pred = [None] * nb_losses
  elif len(y_pred) == 1:
    y_pred = y_pred * nb_losses
  # ====== applying ====== #
  cost = []
  for idx, fn in enumerate(as_tuple(losses)):
    weight = 1
    kwargs = {}
    # preprocess
    if isinstance(fn, (tuple, list)):
      if len(fn) == 1:
        fn = fn[0]
      else:
        weight = [i for i in fn if is_number(i)]
        weight = 1 if len(weight) == 0 else weight[0]
        kwargs = [i for i in fn if isinstance(i, Mapping)]
        kwargs = {} if len(kwargs) == 0 else kwargs[0]
        fn = [i for i in fn if i != weight and i != kwargs][0]
    # apply the loss
    if is_number(fn):
      if inherit_losses is None or fn >= len(inherit_losses):
        raise ValueError("Cannot find losses at index: '%d'" % fn)
      obj = inherit_losses[fn]
    elif K.is_tensor(fn):
      obj = fn
    elif hasattr(fn, '__call__'):
      try:
        sign = inspect.signature(fn)
        if 'weights' in sign.parameters and sample_weights is not None:
          kwargs['weights'] = sample_weights
      except ValueError:
        pass
      finally:
        obj = fn(y_true[idx], y_pred[idx], **kwargs)
      if isinstance(obj, (tuple, list)):
        wprint("function: '%s' returned %d outputs (%s), only the first one is used"
               % (fn.__name__, len(obj), '; '.join([str(i) for i in obj])))
        obj = obj[0]
    cost.append((weight, obj))
  # ====== reduce ====== #
  return [c if w == 1 else w * c for w, c in cost]
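# Standalone sketch of how a single loss specification is unpacked above: a
# tuple may mix a callable, a numeric weight and a kwargs dict in any order.
# `is_number` here is a local stand-in for the odin utility of the same name.
from collections.abc import Mapping

def is_number(x):
  return isinstance(x, (int, float)) and not isinstance(x, bool)

spec = (0.5, {'weights': None}, lambda y_true, y_pred: 0.0)
weight = ([i for i in spec if is_number(i)] or [1])[0]
kwargs = ([i for i in spec if isinstance(i, Mapping)] or [{}])[0]
loss_fn = [i for i in spec if not is_number(i) and not isinstance(i, Mapping)][0]
# weight == 0.5, kwargs == {'weights': None}, loss_fn is the callable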
def _apply(self, X):
  axes = self.axes
  ndims = X.shape.ndims
  if is_string(axes) and axes.lower() == 'auto':
    if ndims == 3:
      axes = (1,)
    elif ndims == 4:
      axes = (1, 2)
    elif ndims == 5:
      axes = (1, 2, 3)
  X = K.upsample(X, scale=self.size, axes=axes, method=self.mode)
  # ====== check desire_shape ====== #
  desire_shape = self.desire_shape
  if desire_shape is not None:
    desire_shape = [None if i is None or i < 0 else int(i)
                    for i in desire_shape]
    # do padding if necessary
    paddings = [[0, 0] if i is None or o is None or i >= o else
                [tf.cast(tf.ceil((o - i) / 2), 'int32'),
                 tf.cast(tf.floor((o - i) / 2), 'int32')]
                for i, o in zip(X.shape.as_list(), desire_shape)]
    if not all(i == [0, 0] for i in paddings):
      X = tf.pad(X, paddings=paddings, mode='CONSTANT')
    # do slice if necessary
    slices = [slice(tf.cast(tf.floor((i - o) / 2), 'int32'),
                    tf.cast(-tf.ceil((i - o) / 2), 'int32'), None)
              if i is not None and o is not None and i > o else slice(None)
              for i, o in zip(X.shape.as_list(), desire_shape)]
    if any(s != slice(None) for s in slices):
      X = X[slices]
    K.set_shape(X, tuple([i if is_number(i) else None
                          for i in desire_shape]))
  return X
def shape(self):
  """ This is only an "upper" estimate; some data points might be lost
  while each index is preprocessed by the recipes. """
  # ====== first time calculate the shape ====== #
  if self._cache_shape is None or self._recipes_changed:
    # for each Descriptor, create list of pairs: (name, length)
    shapes_indices = []
    for dat in self._data:
      indices = []
      length = 0
      for name in self.indices_keys:
        start, end = dat.indices[name]
        lng = end - start
        length += lng
        indices.append((name, lng))
      # modify shapes by estimated length from indices
      shapes = (dat.shape,) if is_number(dat.shape[0]) \
          else dat.shape
      # NOTE: the indices are copied for each shape (i.e. data),
      # hence, it will create some overhead in shape_transform
      for shp in [(length,) + shp[1:] for shp in shapes]:
        shapes_indices.append((shp, list(indices)))
    # Recipes shape_transform
    shapes = tuple([shp for shp, ids
                    in self._recipes.shape_transform(shapes_indices)])
    del shapes_indices
    self._cache_shape = tuple(shapes)
    self._recipes_changed = False
  # ====== get the cached shape ====== #
  if any(s[0] == 0 for s in self._cache_shape):
    raise RuntimeError("Feeder has `length=0`, change the recipes to retain "
                       "a minimum of `length>=1`, shape: %s"
                       % str(self._cache_shape))
  return self._cache_shape
def __init__(self, slices, axis, data_idx=None):
  super(Slice, self).__init__()
  # ====== validate axis ====== #
  if not is_number(axis):
    raise ValueError('axis for Slice must be an integer.')
  self.axis = int(axis)
  # ====== validate indices ====== #
  if is_number(slices):
    slices = slice(int(slices), int(slices + 1))
  elif isinstance(slices, (tuple, list)):
    slices = [i if isinstance(i, slice) else slice(int(i), int(i + 1))
              for i in slices
              if isinstance(i, slice) or is_number(i)]
  elif not isinstance(slices, slice):
    raise ValueError('indices must be int, slice, or list of int and slice.')
  self.slices = slices
  # ====== store data_idx ====== #
  self.data_idx = data_idx
def _get_index(self, name):
  index = self.idx[name]
  if self.threshold is None:
    pass  # no thresholding, use the stored index as-is
  elif hasattr(self.threshold, '__call__'):
    index = self.threshold(index)
  elif is_number(self.threshold):
    index = index >= float(self.threshold)
  if index.dtype != np.bool_:
    index = index.astype('bool')
  return index
def _check_label_mode(mode):
  if is_number(mode):
    return np.clip(float(mode), 0., 1.)
  if is_string(mode):
    mode = mode.lower()
    if mode == 'mid':
      mode = 'middle'
    if mode not in ('common', 'last', 'first', 'middle'):
      raise ValueError(
          "`label_mode` can be: 'common', 'last', 'first', 'middle'")
    return mode
  raise ValueError("No support for `label_mode`=%s" % str(mode))
def set_log_level(self, level):
  """
  level: {int, bool}
    if `int`, log-level in integer (from 0 - 9), higher means more detail,
    -1 for turning off the log.
    if True, set the log-level to default: 2
  """
  if is_number(level):
    self._log_level = int(level)
  elif bool(level):
    self._log_level = 2
  else:
    self._log_level = -1
  return self
def _preprocessing_data(train, valid):
  from odin import fuel as F
  train = F.as_data(train)
  if is_number(valid):
    start_train = 0.
    end_train = 1. - valid
    start_valid = 1. - valid
    end_valid = 1.
    valid = F.DataGroup(train.data).set_batch(start=start_valid, end=end_valid)
    train = F.DataGroup(train.data).set_batch(start=start_train, end=end_train)
  elif valid is not None:
    valid = F.as_data(valid)
  return train, valid
def __init__(self, indices, axis, target_data=None):
  super(Slice, self).__init__()
  # ====== validate axis ====== #
  if not isinstance(axis, int):
    raise ValueError('axis for Slice must be an integer.')
  if axis == 0 and target_data is not None:
    raise ValueError("You can only apply Slice on axis=0 for all Data, "
                     "(i.e. 'target_data' must be None when axis=0)")
  self.axis = axis
  # ====== validate indices ====== #
  if is_number(indices):
    indices = slice(int(indices), int(indices + 1))
  elif isinstance(indices, (tuple, list)):
    indices = [i if isinstance(i, slice) else slice(int(i), int(i + 1))
               for i in indices
               if isinstance(i, slice) or is_number(i)]
  elif not isinstance(indices, slice):
    raise ValueError('indices must be int, slice, or list of int and slice.')
  self.indices = indices
  # ====== validate target_data ====== #
  if target_data is not None and not isinstance(target_data, (tuple, list)):
    target_data = (target_data,)
  self._target_data = target_data
def _validate_shape_dtype(x):
  if not isinstance(x, tuple):
    return False
  if len(x) != 2:
    return False
  shape, dtype = x
  # check shape: must be a tuple of numbers and/or None
  if not isinstance(shape, tuple) or \
  not all(is_number(i) or i is None for i in shape):
    return False
  # check dtype
  if not is_string(dtype):
    return False
  return True
def __init__(self, lr, decay_steps=None, decay_rate=0.96, staircase=True,
             clipnorm=None, clipvalue=None, clip_alg='total_norm',
             name=None):
  if name is None:
    name = self.__class__.__name__ + '_' + str(uuid(length=4))
  elif not isinstance(name, string_types):
    name = str(name)
  self._name = str(name)
  self.staircase = bool(staircase)
  with tf.variable_scope(self._name):
    self._lr = _as_variable(lr, name='learning_rate', roles=LearningRate)
    self._lr_decay = None
    self._step = tf.Variable(0., dtype=floatX,
                             name="%s_step" % self.__class__.__name__)
    self.decay_steps = decay_steps
    self.decay_rate = decay_rate
    if clipnorm is not None:
      if (clipnorm if is_number(clipnorm) else get_value(clipnorm)) <= 0:
        raise ValueError('`clipnorm` value must be greater than 0.')
      self.clipnorm = _as_variable(clipnorm, name="clip_norm",
                                   roles=GraidentsClippingNorm)
    if clipvalue is not None:
      if (clipvalue if is_number(clipvalue) else get_value(clipvalue)) <= 0:
        raise ValueError('`clipvalue` value must be greater than 0.')
      self.clipvalue = _as_variable(clipvalue, name="clip_value",
                                    roles=GraidentsClippingValue)
  # ====== internal states values ====== #
  clip_alg = str(clip_alg).strip().lower()
  if clip_alg not in ('total_norm', 'norm', 'avg_norm'):
    raise ValueError("`clip_alg` must be one of the following: "
                     "'norm', 'total_norm', 'avg_norm'")
  self._norm = 0.
  self.clip_alg = clip_alg
  self._algorithm = None
  self._is_initialized = False
def kl_gaussian(mu, logsigma, prior_mu=0., prior_logsigma=0.):
  """ KL-divergence between two gaussians.
  Useful for Variational AutoEncoders; use this as an activation regularizer.

  For using kl_gaussian as a variational regularizer, you can take the mean
  of the returned matrix.

  Parameters
  ----------
  mu, logsigma: parameters of the input distributions
  prior_mu, prior_logsigma: parameters of the desired distribution
    (note the log on logsigma)

  Return
  ------
  matrix: (n_samples, n_features)

  Note
  ----
  original implementation from:
  https://github.com/Philip-Bachman/ICML-2015/blob/master/LogPDFs.py
  Copyright (c) Philip Bachman
  """
  if is_number(prior_mu):
    prior_mu = tf.convert_to_tensor(prior_mu, name='prior_mu',
                                    dtype=mu.dtype.base_dtype)
  if is_number(prior_logsigma):
    prior_logsigma = tf.convert_to_tensor(prior_logsigma,
                                          name='prior_logsigma',
                                          dtype=logsigma.dtype.base_dtype)
  gauss_klds = 0.5 * (2 * (prior_logsigma - logsigma) +
                      (tf.exp(2 * logsigma) / tf.exp(2 * prior_logsigma)) +
                      (tf.pow((mu - prior_mu), 2.0) /
                       tf.exp(2 * prior_logsigma)) - 1.0)
  return gauss_klds
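# Numerical sanity sketch (plain numpy, no TensorFlow) of the closed form used
# above: KL(N(mu, sigma^2) || N(mu0, sigma0^2)) with sigma given in log-space.
import numpy as np

mu, logsigma = 1.0, np.log(2.0)          # posterior N(1, 2^2)
prior_mu, prior_logsigma = 0.0, 0.0      # standard normal prior
kl = 0.5 * (2 * (prior_logsigma - logsigma)
            + np.exp(2 * logsigma) / np.exp(2 * prior_logsigma)
            + (mu - prior_mu) ** 2 / np.exp(2 * prior_logsigma)
            - 1.0)
# equals log(sigma0/sigma) + (sigma^2 + (mu - mu0)^2) / (2 * sigma0^2) - 0.5
# here: -log(2) + 5/2 - 0.5 ~= 1.307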
def shape_transform(self, shapes):
  """
  Parameters
  ----------
  shapes: list of [(shape0, indices0), (shape1, indices1), ...]
    list of data shape tuple and indices, the indices is a list
    of tuple (name, length)

  Return
  ------
  new shape that transformed by this Recipe
  new indices
  """
  for i in self._recipes:
    shapes = i.shape_transform(shapes)
  # ====== check returned ====== #
  if not all((isinstance(shp, (tuple, list)) and
              all(is_number(s) for s in shp) and
              is_string(ids[0][0]) and is_number(ids[0][1]))
             for shp, ids in shapes):
    raise RuntimeError("Returned `shapes` must be the list of pair "
                       "`(shape, indices)`, where `indices` is the "
                       "list of (name, length(int)).")
  return shapes
def _preprocess_prior_weights(y_true, prior_weights):
  if prior_weights is None:
    return None
  from odin import backend as K
  # ====== everything must be list ====== #
  if not isinstance(prior_weights, (tuple, list)):
    prior_weights = (prior_weights,)
  elif is_number(prior_weights[0]):
    prior_weights = (prior_weights,)
  # ====== matching indices and prior_weights ====== #
  pw = 0
  for yt, w in zip(y_true, prior_weights):
    if w is not None:
      pw += K.to_sample_weights(indices=yt, weights=w)
  return pw
def get_arguments():
  args = ArgController(
  ).add("input", "Name of the dataset or path to csv file"
  ).add("-n", "number of GMM components", 2
  ).add("-idx", "index of the positive component", 1
  ).add("-norm", "method for normalizing: raw, log", 'log', ('log', 'raw')
  ).add("-outpath", "y_bin and y_prob will be saved to this path", ''
  ).add("-figpath", "path for saving analysis figure", '/tmp/tmp.pdf'
  ).add("--verbose", "Enable verbose and saving diagnosis", False
  ).parse()
  inp = str(args.input)
  if os.path.exists(inp):
    assert os.path.isfile(inp), "%s must be path to a file" % inp
    data = []
    with open(inp, 'r') as f:
      for line in f:
        data.append(line.strip().split(','))
    data = np.array(data)
    if all(is_number(i, string_number=True) for i in data[0]):
      y_prot = data.astype('float32')
      y_prot_names = np.array(['#%d' % i for i in range(y_prot.shape[1])])
    else:
      y_prot = data[1:].astype('float32')
      y_prot_names = data[0]
    outpath = args.outpath
  else:
    from sisua.data import get_dataset
    ds, gene_ds, prot_ds = get_dataset(inp, override=False)
    y_prot = ds['y']
    y_prot_names = np.array(ds['y_col'])
    outpath = ds.path if args.outpath == '' else args.outpath
  return {
      'y_prot': y_prot,
      'y_prot_names': y_prot_names,
      'n_components': int(args.n),
      'index': int(args.idx),
      'log_norm': True if args.norm == 'log' else False,
      'outpath': outpath if len(outpath) > 0 else None,
      'figpath': args.figpath if len(args.figpath) > 0 else None,
      'verbose': bool(args.verbose)
  }
def __init__(self, idx, threshold=None, mvn=False, varnorm=True,
             data_idx=None, label_idx=()):
  super(Indexing, self).__init__()
  if not hasattr(idx, '__getitem__'):
    raise ValueError("`idx` must have attribute __getitem__, which takes "
                     "a file name as input and returns an array of indices "
                     "with the same length as the data.")
  if threshold is not None and \
  not hasattr(threshold, '__call__') and \
  not is_number(threshold):
    raise ValueError("`threshold` can be None, a callable, or a number.")
  self.idx = idx
  self.threshold = threshold
  self.data_idx = data_idx
  self.label_idx = label_idx
  # ====== for normalization ====== #
  self.mvn = bool(mvn)
  self.varnorm = bool(varnorm)
def _apply_label_mode(y, mode):
  # This applies the label transform along the first (frame) axis
  if is_number(mode):
    n = y.shape[1]
    n = int(float(mode) * n)
    return y[:, n]
  if mode == 'common':
    raise NotImplementedError
  if mode == 'last':
    return y[:, -1]
  elif mode == 'first':
    return y[:, 0]
  elif mode == 'middle':
    n = y.shape[1]
    if n % 2 == 0:
      n //= 2
    else:
      n = n // 2 + 1
    return y[:, n]
  raise NotImplementedError("No support for label mode: '%s'" % mode)
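# Small numpy illustration of the numeric `mode` branch above: a float in
# [0, 1] selects a relative position along the frame axis.
import numpy as np

y = np.arange(10).reshape(2, 5)    # 2 samples, 5 label frames each
n = int(float(0.5) * y.shape[1])   # -> column index 2
picked = y[:, n]                   # array([2, 7])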
def format_score(s):
  return ctext('%.4f' % s if is_number(s) else s, 'yellow')
def transform(self, texts, mode='seq', dtype='int32',
              padding='pre', truncating='pre', value=0.,
              end_document=None, maxlen=None,
              token_not_found='ignore'):
  """
  Parameters
  ----------
  texts: iterator of unicode
    iterator, generator or list (e.g. [u'a', u'b', ...])
    of unicode documents.
  mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
    'seq', sequences of token indices (padded/truncated to `maxlen`)
    'binary', document-term matrix of 0/1 (token presence)
    'count', document-term matrix of raw token counts
    'freq', token counts normalized by document length
    'tfidf', tf-idf weighted document-term matrix
  token_not_found: 'ignore', 'raise', a token string, or an integer index
    what to do with tokens that are not found in the dictionary
  """
  # ====== check arguments ====== #
  texts = self._validate_texts(texts)
  # ====== check mode ====== #
  mode = str(mode)
  if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
    raise ValueError('The "mode" argument must be: "seq", "binary", '
                     '"count", "freq", or "tfidf".')
  # ====== check token_not_found ====== #
  if not is_number(token_not_found) and \
  not is_string(token_not_found) and \
  token_not_found not in ('ignore', 'raise'):
    raise ValueError('token_not_found can be: "ignore", "raise", '
                     'an integer token index, or a string '
                     'representing a token.')
  if is_number(token_not_found):
    token_not_found = int(token_not_found)
  elif token_not_found not in ('ignore', 'raise'):
    token_not_found = int(self.dictionary[token_not_found])
  # ====== pick engine ====== #
  if self.__engine == 'spacy':
    processor = self._preprocess_docs_spacy
  elif self.__engine == 'odin':
    processor = self._preprocess_docs_odin
  # ====== Initialize variables ====== #
  dictionary = self.dictionary
  results = []
  # ====== preprocess arguments ====== #
  if isinstance(end_document, str):
    end_document = dictionary.index(end_document)
  elif is_number(end_document):
    end_document = int(end_document)
  # ====== processing ====== #
  if hasattr(texts, '__len__'):
    target_len = len(texts)
    auto_adjust_len = False
  else:
    target_len = 1234
    auto_adjust_len = True
  prog = Progbar(target=target_len, name="Tokenize Transform",
                 print_report=True, print_summary=True)
  for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
    vec = []
    for x in doc:
      idx = dictionary.get(x, -1)
      # found the word in dictionary
      if idx >= 0:
        vec.append(idx)
      # not found the token in dictionary
      elif token_not_found == 'ignore':
        continue
      elif token_not_found == 'raise':
        raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
      elif isinstance(token_not_found, int):
        vec.append(token_not_found)
    # append ending document token
    if end_document is not None:
      vec.append(end_document)
    # add the final results
    results.append(vec)
    # print progress
    if self.print_progress:
      prog['#Docs'] = nb_docs
      prog.add(1)
      if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
        prog.target = 1.2 * prog.target
  # end the process
  # if self.print_progress and auto_adjust_len:
  #   prog.target = nb_docs; prog.update(nb_docs)
  # ====== pad the sequence ====== #
  # just transform into sequence of tokens
  if mode == 'seq':
    maxlen = self.longest_document_length if maxlen is None else int(maxlen)
    results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                            padding=padding, truncating=truncating,
                            value=value)
  # transform into document-term matrix
  else:
    X = np.zeros(shape=(len(results), self.nb_words))
    for i, seq in enumerate(results):
      if mode == 'binary':
        X[i, seq] = 1
      elif mode == 'freq':
        length = len(seq)
        count = freqcount(seq)
        for tok, n in count.items():
          X[i, tok] = n / float(length)
      elif mode == 'count':
        count = freqcount(seq)
        for tok, n in count.items():
          X[i, tok] = n
      elif mode == 'tfidf':
        count = freqcount(seq)
        for tok, n in count.items():
          tf = 1 + np.log(n)
          docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
          idf = np.log(1 + self.nb_docs / (1 + docs_freq))
          X[i, tok] = tf * idf
    results = X
  return results
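# Standalone sketch of the tf-idf weighting used in the 'tfidf' branch above
# (numpy only; `nb_docs` and `docs_freq` stand in for the fitted tokenizer's
# corpus statistics).
import numpy as np

nb_docs = 1000    # total documents seen while fitting the tokenizer
docs_freq = 10    # number of documents containing this token
n = 3             # occurrences of the token in the current document
tf_weight = 1 + np.log(n)
idf_weight = np.log(1 + nb_docs / (1 + docs_freq))
value = tf_weight * idf_weight    # value written into X[i, token_index]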
def fit(self, X, y=None, cv=None):
  self._initialize(X)
  if not hasattr(X, 'shape') or not hasattr(X, '__iter__') or \
  not hasattr(X, '__len__'):
    raise ValueError("`X` must have 'shape', '__len__' and '__iter__' attributes")
  nb_train_samples = len(X)
  # convert to odin.fuel.Data if possible
  if isinstance(X, (np.ndarray, list, tuple)):
    X = F.as_data(X)
  if isinstance(y, (np.ndarray, list, tuple)):
    y = F.as_data(y)
  start_tr = 0
  end_tr = nb_train_samples
  # ====== check if cross validating ====== #
  create_it_cv = None
  if is_number(cv):
    cv = int(float(cv) * nb_train_samples) if cv < 1. else int(cv)
    end_tr = nb_train_samples - cv
    start_cv = end_tr
    end_cv = nb_train_samples
    nb_cv_samples = end_cv - start_cv
    create_it_cv = _create_it_func(X=X, y=y, batch_size=self.batch_size,
                                   start=start_cv, end=end_cv)
  elif isinstance(cv, (tuple, list)):
    X_cv, y_cv = cv
    nb_cv_samples = X_cv.shape[0]
    create_it_cv = _create_it_func(X=X_cv, y=y_cv, batch_size=self.batch_size,
                                   start=0, end=X_cv.shape[0])
  elif hasattr(cv, 'set_batch'):
    nb_cv_samples = cv.shape[0]
    create_it_cv = _create_it_func(X=cv, y=None, batch_size=self.batch_size,
                                   start=0, end=cv.shape[0])
  elif cv is not None:
    raise ValueError('`cv` can be float (0-1), tuple or list of X and y, '
                     'any object that have "shape" and "__iter__" attributes, '
                     'or None')
  # ====== preprocessing ====== #
  create_it = _create_it_func(X=X, y=y, batch_size=self.batch_size,
                              start=start_tr, end=end_tr)
  # ====== prepare ====== #
  curr_niter = sum(epoch[0] for epoch in self._train_history)
  curr_nepoch = len(self._train_history)
  curr_patience = int(self.patience)
  last_losses = None
  last_checkpoint = None
  best_epoch = None
  is_converged = False
  # ====== fitting ====== #
  while not is_converged:
    curr_nepoch += 1
    seed = self._rand_state.randint(0, 10e8)
    # ====== training ====== #
    nb_iter, duration, results = _fitting_helper(create_it(seed),
                                                 fn=self._f_train,
                                                 nb_samples=nb_train_samples,
                                                 nb_classes=self.nb_classes,
                                                 title='Epoch %d' % curr_nepoch)
    curr_niter += nb_iter
    self._train_history.append(
        (nb_train_samples, nb_iter, duration, results))
    # ====== cross validation ====== #
    if create_it_cv is not None:
      nb_iter, duration_valid, results = _fitting_helper(create_it_cv(seed),
                                                         fn=self._f_score,
                                                         nb_samples=nb_cv_samples,
                                                         nb_classes=self.nb_classes,
                                                         title="Validating")
      self._valid_history.append(
          (nb_train_samples, nb_iter, duration_valid, results))
      duration += duration_valid
    # ====== print log ====== #
    if self.verbose >= 2:
      print(ctext('#epoch:', 'cyan') + str(curr_nepoch),
            ctext('#iter:', 'cyan') + str(curr_niter),
            ctext("Loss:", 'yellow') + '%.5f' % results[0],
            ctext("Acc:", 'yellow') + '%.3f' % results[1],
            ctext("%.2f(s)" % duration, 'magenta'))
      if self.confusion_matrix and (curr_nepoch - 1) % 8 == 0:
        print(V.print_confusion(results[-1], labels=self.labels))
    # ====== early stopping ====== #
    losses = results[0]
    if last_checkpoint is None:  # first check point
      last_checkpoint = self.parameters
    if last_losses is not None:
      # degraded, smaller is better
      if last_losses - losses <= self.tol:
        curr_patience -= 1
        if self.rollback:
          if self.verbose >= 2:
            wprint('[LogisticRegression] Rollback to the best checkpoint '
                   'at epoch:%s patience:%s' %
                   (ctext(best_epoch, 'cyan'), ctext(curr_patience, 'cyan')))
          self.set_parameters(*last_checkpoint)
      # save best checkpoint
      else:
        last_checkpoint = self.parameters
        best_epoch = curr_nepoch
        if self._path is not None:
          with open(self._path, 'wb') as f:
            pickle.dump(self, f)
    last_losses = losses
    if curr_patience <= 0:
      is_converged = True
    # end the training
    if self.max_iter is not None and \
    curr_niter >= self.max_iter:
      break
    if self.max_epoch is not None and \
    curr_nepoch >= self.max_epoch:
      break
  # ====== print summary plot ====== #
  if self.verbose >= 1:
    train_losses = [epoch[-1][0] for epoch in self._train_history]
    print(V.print_bar(train_losses, height=12,
                      bincount=min(20, len(train_losses)),
                      title='Training Losses'))
    if create_it_cv is not None:
      valid_losses = [epoch[-1][0] for epoch in self._valid_history]
      print(V.print_bar(valid_losses, height=12,
                        bincount=min(20, len(train_losses)),
                        title='Validation Losses'))
    if self.confusion_matrix:
      print(ctext("======== Training Confusion Matrix ========", 'cyan'))
      print(V.print_confusion(arr=self._train_history[-1][-1][-1],
                              labels=self.labels))
      if create_it_cv is not None:
        print(ctext("======== Validation Confusion Matrix ========", 'cyan'))
        print(V.print_confusion(arr=self._valid_history[-1][-1][-1],
                                labels=self.labels))
  # ====== reset to best points ====== #
  self.set_parameters(*last_checkpoint)
  self._is_fitted = True
  if self._path is not None:
    with open(self._path, 'wb') as f:
      pickle.dump(self, f)
def __init__(self, nb_classes, l1=0., l2=0., fit_intercept=True,
             confusion_matrix=True, tol=1e-4, patience=3, rollback=True,
             batch_size=1024, max_epoch=100, max_iter=None,
             optimizer='adadelta', learning_rate=1.0, class_weight=None,
             dtype='float32', seed=5218, verbose=False, path=None, name=None):
  super(LogisticRegression, self).__init__()
  # ====== basic dimensions ====== #
  if isinstance(nb_classes, (tuple, list, np.ndarray)):
    self._labels = tuple([str(i) for i in nb_classes])
    self._nb_classes = len(nb_classes)
  elif is_number(nb_classes):
    self._labels = tuple([str(i) for i in range(nb_classes)])
    self._nb_classes = int(nb_classes)
  self._feat_dim = None
  self._dtype = np.dtype(dtype)
  # ====== preprocessing class weight ====== #
  if class_weight is None:
    class_weight = np.ones(shape=(self.nb_classes,), dtype=self.dtype)
  elif is_number(class_weight):
    class_weight = np.zeros(shape=(self.nb_classes,),
                            dtype=self.dtype) + class_weight
  self._class_weight = class_weight
  # ====== flags ====== #
  self.l1 = float(l1)
  self.l2 = float(l2)
  self.fit_intercept = bool(fit_intercept)
  self.confusion_matrix = bool(confusion_matrix)
  # ====== internal states ====== #
  self._is_fitted = False
  # ====== others ====== #
  if name is None:
    name = uuid(length=8)
    self._name = 'LogisticRegression_%s' % name
  else:
    self._name = str(name)
  self._path = path
  # ====== training ====== #
  self.batch_size = int(batch_size)
  self.max_epoch = max_epoch
  self.max_iter = max_iter
  if not is_string(optimizer):
    raise ValueError("`optimizer` must be a string.")
  optimizer = optimizer.lower()
  if optimizer not in _optimizer_list:
    raise ValueError("`optimizer` must be one of the following: %s" %
                     str(list(_optimizer_list.keys())))
  self._optimizer = _optimizer_list[optimizer.lower()](lr=float(learning_rate))
  self._optimizer_name = optimizer
  self._optimizer_lr = learning_rate
  # ====== stop training ====== #
  self.tol = float(tol)
  self.patience = int(patience)
  self.rollback = bool(rollback)
  # ====== others ====== #
  self._train_history = []
  self._valid_history = []
  self._rand_state = np.random.RandomState(seed=int(seed))
  self.verbose = int(verbose)
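# Usage sketch for this class (commented out because it needs the odin backend
# and a TensorFlow session; `X_train`, `y_train` are placeholders for your own
# numpy arrays).
# model = LogisticRegression(nb_classes=10, l2=1e-4, batch_size=256,
#                            max_epoch=20, verbose=True)
# model.fit(X_train, y_train, cv=0.1)   # hold out 10% of rows for validation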
def __len__(self):
  """ len always returns a single number """
  shape = self.shape
  if is_number(shape[0]):
    return shape[0]
  return self.shape[0][0]
def _post_processing(self, X):
  X = X[:, -1]  # remove timestamp
  if is_number(self.threshold):
    X = (X >= self.threshold).astype("bool")
  return {self.output_name: X}
def upsample(x, scale, axes, method='nn', name=None):
  """
  Parameters
  ----------
  scale: int, list of int
    scaling up factor
  axes: int, list of int
    the axes of tensor which the upsampling method will be applied
  method: str, int
    'nn' for nearest neighbor (e.g. [1, 2] => [1, 1, 2, 2]),
    'pad' for padding within the tensor.
    'pad_margin' do padding in the margin of the tensor.
    'repeat' simple algorithm for repeating the element (e.g. [1, 2] => [1, 2, 1, 2])
  """
  with tf.name_scope(name, "Upsample"):
    method = method.lower()
    input_shape = tf.shape(x)
    input_shape_int = x.shape.as_list()
    ndims = x.shape.ndims
    # normalize all negative axes
    if axes is None:
      raise ValueError("axes cannot be None.")
    axes = [1, 2] if axes is None else \
        [i % ndims for i in as_tuple(axes)]
    axes = sorted(axes)
    # make scale a tuple
    scale = as_tuple(scale, N=len(axes), t=int)
    # mapping from axis -> scale
    scale_map = defaultdict(lambda: 1)
    scale_map.update([(i, j) for i, j in zip(axes, scale)])
    # create final output_shape
    output_shape = [input_shape[i] * scale_map[i] for i in range(ndims)]
    # ====== Nearest neighbor method ====== #
    if method == 'nn':
      # tensorflow only supports tile for <= 6-D tensors
      if ndims >= 6:
        raise ValueError('upsample with NN mode does not support rank >= 6 tensor.')
      elif ndims + len(axes) > 6:
        for a in axes:
          x = upsample(x, scale_map[a], axes=a, method='nn')
      else:
        # repeat the tensor
        x = dimshuffle(x, pattern=list(range(ndims)) + ['x'] * len(axes))
        x = repeat(x, scale, axes=[i for i in range(ndims, ndims + len(axes))])
        # transpose it back to the right shape
        axes_map = {i: j for i, j in zip(axes, range(ndims, ndims + len(axes)))}
        new_axes = []
        for i in range(ndims):
          if i not in axes_map:
            new_axes.append(i)
          else:
            new_axes += [i, axes_map[i]]
        x = tf.transpose(x, perm=new_axes)
        x = reshape(x, output_shape)
    # ====== padding_margin ====== #
    elif method.lower() == 'pad_margin':
      paddings = [[0, 0] if i not in axes else
                  [tf.cast(tf.ceil(input_shape[i] * (scale_map[i] - 1) / 2), 'int32'),
                   tf.cast(tf.floor(input_shape[i] * (scale_map[i] - 1) / 2), 'int32')]
                  for i in range(ndims)]
      x = tf.pad(x, paddings=paddings, mode='CONSTANT')
    # ====== padding ====== #
    elif method == 'pad':
      raise NotImplementedError
      # x = tf.scatter_nd(indices, x, shape=output_shape)
    # ====== repeat ====== #
    elif method == 'repeat':
      x = repeat(x, n=scale, axes=axes)
    # ====== no support ====== #
    else:
      raise ValueError("No support for method='%s'" % method)
    # ====== add_shape ====== #
    return set_shape(x, shape=[s * scale_map[i] if is_number(s) else None
                               for i, s in enumerate(input_shape_int)])
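# Plain-numpy sketch of the two repeat-based modes documented above:
# 'nn' duplicates each element in place, 'repeat' tiles the whole axis.
import numpy as np

x = np.array([1, 2, 3])
nn_like = np.repeat(x, 2)    # [1, 1, 2, 2, 3, 3]  -> behaviour of method='nn'
repeat_like = np.tile(x, 2)  # [1, 2, 3, 1, 2, 3]  -> behaviour of method='repeat'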
def _initialize(self, X):
  # ====== check inputs dimensions ====== #
  if not hasattr(X, 'shape'):
    raise ValueError("`X` must have `shape` attribute.")
  feat_dim = np.prod(X.shape[1:])
  if self._feat_dim is None:
    self._feat_dim = feat_dim
  # validate input dimension
  if feat_dim != self._feat_dim:
    raise RuntimeError("Feature dimension mismatch %d and %d" %
                       (feat_dim, self.feat_dim))
  # check if tensorflow ops are already initialized
  if hasattr(self, '_f_train'):
    return
  # ====== binary or multi-classes ====== #
  if self.nb_classes == 2:
    out_shape = (None,)
    fn_activation = tf.nn.sigmoid
    fn_loss = tf.losses.sigmoid_cross_entropy
    fn_acc = K.metrics.binary_accuracy
  else:
    out_shape = (None, self.nb_classes)
    fn_activation = tf.nn.softmax
    fn_loss = tf.losses.softmax_cross_entropy
    fn_acc = K.metrics.categorical_accuracy
  # ====== create model ====== #
  with tf.name_scope(self.name, 'logistic_regression'):
    # inputs
    self._X = K.placeholder(shape=(None, self.feat_dim),
                            dtype=self.dtype,
                            name='%s_input' % self.name)
    self._y = K.placeholder(shape=out_shape,
                            dtype=self.dtype,
                            name='%s_output' % self.name)
    # check the bias
    if is_number(self.fit_intercept):
      b_init = float(self.fit_intercept)
    elif self.fit_intercept is False or self.fit_intercept is None:
      b_init = None
    else:
      b_init = self.fit_intercept
    # create the model and initialize
    with K.variable_dtype(dtype=self.dtype):
      self._model = N.Dense(
          num_units=self.nb_classes,
          W_init=init_ops.glorot_uniform_initializer(
              seed=self._rand_state.randint(0, 10e8)),
          b_init=b_init,
          activation=K.linear)
      y_logits = self._model(self._X)
    y_prob = fn_activation(y_logits)
    # applying class weights
    class_weights = tf.constant(value=self._class_weight,
                                dtype=self.dtype,
                                name="class_weights")
    weights = tf.gather(class_weights,
                        tf.cast(self._y, 'int32') if self.nb_classes == 2
                        else tf.argmax(self._y, axis=-1))
    # optimizer
    params = [v for v in self._model.variables
              if has_roles(v, Weight) or has_roles(v, Bias)]
    losses = fn_loss(self._y, y_logits, weights=weights)
    l1_norm = tf.norm(self._model.get('W'), ord=1) if self.l1 > 0. else 0
    l2_norm = tf.norm(self._model.get('W'), ord=2) if self.l2 > 0. else 0
    losses = losses + self.l1 * l1_norm + self.l2 * l2_norm
    acc = fn_acc(self._y, y_prob)
    updates = self._optimizer.get_updates(losses, params)
    # create function
    if self.confusion_matrix:
      cm = K.metrics.confusion_matrix(y_true=self._y, y_pred=y_prob,
                                      labels=self.nb_classes)
    metrics = [losses, acc, cm] if self.confusion_matrix else [losses, acc]
    self._f_train = K.function(inputs=(self._X, self._y),
                               outputs=metrics,
                               updates=updates,
                               training=True)
    self._f_score = K.function(inputs=(self._X, self._y),
                               outputs=metrics,
                               training=False)
    self._f_pred_prob = K.function(inputs=self._X,
                                   outputs=y_prob,
                                   training=False)
    self._f_pred_logit = K.function(inputs=self._X,
                                    outputs=y_logits,
                                    training=False)
  return self
def confusion_matrix(y_true, y_pred, labels=None, normalize=False, name=None):
  """
  Computes the confusion matrix of given vectors containing
  actual observations and predicted observations.

  Parameters
  ----------
  y_true : 1-d or 2-d tensor variable
    true values
  y_pred : 1-d or 2-d tensor variable
    prediction values
  normalize : bool
    if True, normalize each row to [0., 1.]
  labels : array, shape = [nb_classes], int (nb_classes)
    List of labels to index the matrix. This may be used to reorder
    or select a subset of labels.
    If none is given, those that appear at least once
    in ``y_true`` or ``y_pred`` are used in sorted order.

  Note
  ----
  if you want to calculate Precision, Recall, F1 scores from the
  confusion matrix, set `normalize=False`
  """
  # ====== numpy ndarray ====== #
  if isinstance(y_true, np.ndarray) or isinstance(y_pred, np.ndarray):
    from sklearn.metrics import confusion_matrix as sk_cm
    nb_classes = None
    if y_true.ndim > 1:
      nb_classes = y_true.shape[1]
      y_true = np.argmax(y_true, axis=-1)
    if y_pred.ndim > 1:
      nb_classes = y_pred.shape[1]
      y_pred = np.argmax(y_pred, axis=-1)
    # get number of classes
    if labels is None:
      if nb_classes is None:
        raise RuntimeError("Cannot infer the number of classes for confusion matrix")
      labels = list(range(int(nb_classes)))
    elif is_number(labels):
      labels = list(range(labels))
    cm = sk_cm(y_true=y_true, y_pred=y_pred, labels=labels)
    if normalize:
      cm = cm.astype('float32') / np.sum(cm, axis=1, keepdims=True)
    return cm
  # ====== tensorflow tensor ====== #
  with tf.name_scope(name, 'confusion_matrix', [y_true, y_pred]):
    from tensorflow.contrib.metrics import confusion_matrix as tf_cm
    nb_classes = None
    if y_true.shape.ndims == 2:
      nb_classes = y_true.shape.as_list()[-1]
      y_true = tf.argmax(y_true, -1)
    elif y_true.shape.ndims != 1:
      raise ValueError('actual must be 1-d or 2-d tensor variable')
    if y_pred.shape.ndims == 2:
      nb_classes = y_pred.shape.as_list()[-1]
      y_pred = tf.argmax(y_pred, -1)
    elif y_pred.shape.ndims != 1:
      raise ValueError('pred must be 1-d or 2-d tensor variable')
    # check valid labels
    if labels is None:
      if nb_classes is None:
        raise RuntimeError("Cannot infer the number of classes for confusion matrix")
      labels = int(nb_classes)
    elif is_number(labels):
      labels = int(labels)
    elif hasattr(labels, '__len__'):
      labels = len(labels)
    # transpose to match the format of sklearn
    cm = tf_cm(labels=y_true, predictions=y_pred, num_classes=labels)
    if normalize:
      cm = tf.cast(cm, dtype='float32')
      cm = cm / tf.reduce_sum(cm, axis=1, keep_dims=True)
    return add_roles(cm, ConfusionMatrix)
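# Standalone sketch of the numpy path above: sklearn's confusion matrix plus
# the row normalization applied when `normalize=True`.
import numpy as np
from sklearn.metrics import confusion_matrix as sk_cm

y_true = np.array([0, 0, 1, 2, 2, 2])
y_pred = np.array([0, 1, 1, 2, 2, 0])
cm = sk_cm(y_true=y_true, y_pred=y_pred, labels=list(range(3)))
cm_norm = cm.astype('float32') / np.sum(cm, axis=1, keepdims=True)
# every row of `cm_norm` sums to 1.0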
def standard_trainer(train_data, valid_data,
                     X, y_train, y_score, y_target, parameters,
                     test_data=None,
                     cost_train=None, cost_score=None, optimizer=None,
                     confusion_matrix=False, gradient_norm=True,
                     save_path=None, save_obj=None,
                     batch_size=64, nb_epoch=3, valid_freq=0.6,
                     seed=1208, shuffle_level=2,
                     patience=3, earlystop=5, report_path=None):
  """
  Parameters
  ----------
  cost_train: list of callable
    each function will be applied to a pair of y_train and y_target

  Return
  ------
  MainLoop, and History
  """
  from odin import backend as K
  # ====== prepare variables and cost ====== #
  # check optimizer
  if optimizer is None:
    optimizer = K.optimizers.SGD(lr=0.0001, momentum=0.9, nesterov=True)
  elif not isinstance(optimizer, K.optimizers.Optimizer) and \
  not hasattr(optimizer, "get_updates"):
    raise ValueError("Invalid optimizer, the optimizer must be instance of "
                     "backend.optimizers.Optimizer or having function "
                     "get_updates(self, loss_or_grads, params).")
  # check the cost functions
  if cost_train is None:
    cost_train = K.categorical_crossentropy
  if cost_score is None:
    cost_score = K.categorical_crossentropy
  cost_train = as_tuple(cost_train)
  cost_score = as_tuple(cost_score)
  # check input X, y, parameters
  X = as_tuple(X)
  y_train = as_tuple(y_train)
  y_score = as_tuple(y_score)
  y_target = as_tuple(y_target)
  parameters = as_tuple(parameters)
  if len(X) == 0 or len(y_train) == 0 or len(y_score) == 0 or \
  len(y_target) == 0 or len(parameters) == 0:
    raise ValueError(
        "X(len=%d), y_train(len=%d), y_score(len=%d), y_target(len=%d), "
        "and parameters(len=%d) must be list or tuple with length > 0." %
        (len(X), len(y_train), len(y_score), len(y_target), len(parameters)))
  # get all cost
  if len(y_train) == 1:
    y_train = y_train * len(cost_train)
  if len(y_score) == 1:
    y_score = y_score * len(cost_score)
  cost_train = [K.mean(f_cost(y_, y), axis=0)
                for f_cost, y_, y in zip(
                    cost_train, y_train,
                    y_target * len(cost_train) if len(y_target) == 1 else y_target)]
  cost_score = [K.mean(f_cost(y_, y), axis=0)
                for f_cost, y_, y in zip(
                    cost_score, y_score,
                    y_target * len(cost_score) if len(y_target) == 1 else y_target)]
  # add confusion matrix
  if confusion_matrix:
    if not is_number(confusion_matrix) and \
    not isinstance(confusion_matrix, (tuple, list, np.ndarray)):
      raise ValueError("confusion_matrix must be an integer or a list/tuple "
                       "specifying the number of classes, or a list of all classes.")
    if is_number(confusion_matrix):
      confusion_matrix = list(range(int(confusion_matrix)))
    for y_, y in zip(y_score, y_target):
      cost_score.append(
          K.confusion_matrix(y_pred=y_, y_true=y, labels=confusion_matrix))
  # get the update
  updates = optimizer.get_updates(cost_train[0], parameters)
  # ====== create function ====== #
  grad_norm = [] if not gradient_norm or not hasattr(optimizer, 'norm') else \
      [optimizer.norm]
  cost_train = cost_train + grad_norm
  print('Building training functions ...')
  f_train = K.function(inputs=X + y_target, outputs=cost_train,
                       updates=updates)
  print('Building scoring functions ...')
  f_score = K.function(inputs=X + y_target, outputs=cost_score)
  # ====== Create trainer ====== #
  task = MainLoop(batch_size=batch_size, seed=seed,
                  shuffle_level=shuffle_level)
  if save_path is not None and save_obj is not None:
    task.set_save(save_path, save_obj, save_hist=True)
  # set task
  task.set_task(f_train, train_data, epoch=nb_epoch, name='train')
  task.set_subtask(f_score, valid_data, freq=valid_freq, name='valid')
  if test_data is not None:
    task.set_subtask(f_score, test_data, when=-1, epoch=1, name='test')
  # format for score
  score_format = 'Results:' + __format_string(
      len(cost_score) - (1 if confusion_matrix else 0))
  score_tracking = ({(len(cost_score) - 1): lambda x: sum(x)}
                    if confusion_matrix else [])
  # set the callback
  history = History()
  task.set_callback([
      ProgressMonitor(name='train',
                      format='Results:' + __format_string(len(cost_train))),
      ProgressMonitor(name='valid', format=score_format,
                      tracking=score_tracking),
      (ProgressMonitor(name='test', format=score_format,
                       tracking=score_tracking)
       if test_data is not None else None),
      history,
      EarlyStopGeneralizationLoss(
          'valid', threshold=earlystop, patience=patience,
          get_value=lambda x: np.mean([i[0] for i in x]
                                      if isinstance(x[0], (tuple, list)) else x)),
      NaNDetector(('train', 'valid'), patience=patience, rollback=True)
  ])
  return task, history