def run(function, queue, indices, rseed, *args):
    # randomize
    np_seed(rseed)
    py_seed(rseed)
    # compute and store results
    queue.put(dict(zip(indices, map(function, *args))))
def fix_seed(seed: int, deterministic: bool = False, benchmark: bool = False) -> None:
    """
    Fix the seed value of PyTorch, NumPy and pure-Python ``random`` at once.

    Examples:
        >>> import torch
        >>> import numpy as np
        >>> fix_seed(0)
        >>> x = torch.randn(...)
        >>> y = np.random.randn(...)

    Args:
        seed (int): random state (seed)
        deterministic (bool): Whether to ensure reproducibility as much as
            possible on CuDNN.
        benchmark (bool): If True, turn off CuDNN benchmark mode.

    Returns:
        None
    """
    std_seed(seed)
    os_environ["PYTHONHASHSEED"] = str(seed)
    np_seed(seed)
    torch.manual_seed(seed)
    if cuda_is_available():
        if deterministic:
            cudnn.deterministic = True
        if benchmark:
            cudnn.benchmark = False
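# Hedged usage check (not part of the snippet above): with the same seed
# value, two rounds of draws from torch and numpy should repeat exactly.
# Assumes torch and numpy are importable in the calling module.
import numpy as np
import torch

fix_seed(0)
a, b = torch.randn(3), np.random.randn(3)
fix_seed(0)
assert torch.equal(a, torch.randn(3))
assert np.allclose(b, np.random.randn(3))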
def SeedType(string):
    try:
        val = int(string)
        seed(val)
        np_seed(val)
        return val
    except ValueError:
        raise ArgumentTypeError("invalid seed '{0:s}'".format(string))
def __enter__(self):
    self._state = getstate()
    if NUMPY:
        self._np_state = np_getstate()
    seed(self.seed)
    if NUMPY:
        np_seed(self.seed)
    return self
def lr_train(training_path, output_path, label_col, seed, scoring, flank_size,
             feature_dim, proximal, usegc, c_values, penalty_options, n_jobs,
             overwrite, verbose):
    """logistic regression training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-lr.pkl.gz")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(f"Skipping. {outpath} exists. "
                        "use overwrite to force.",
                        fg='green')
        return

    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-lr.log")
    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)
    start_time = time.time()

    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)
    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)

    classifier = logistic_regression(feat, resp, seed, scoring, c_values,
                                     penalty_options.split(","), n_jobs)
    betas = dict(zip(names, classifier.best_estimator_.coef_.tolist()[0]))
    result = dict(classifier=classifier.best_estimator_, betas=betas,
                  scoring=scoring)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size, proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
def get_features(tweetsTrain, tweetsValidation, tweetsTest, embeddings_path,
                 random_state=None):
    """Classification with RNN and embeddings (pre-trained)
    """
    # Offset = 2; Padding and OOV.
    print("Begin loading embeddings.")
    word_embeddings, word_emb_indices = read_embeddings(embeddings_path, 2)
    print("End loading embeddings.")

    np_seed(random_state)

    # Build vocabulary and corpus indexes
    print("Computing training tokens")
    vocabulary_train, corpus_train_index = fit_transform_vocabulary_pretrain_embeddings(
        tweetsTrain, word_emb_indices)

    # Set a max input length
    max_len_input = int(
        np.percentile([len(tweet_train) for tweet_train in corpus_train_index],
                      95, axis=0))

    # Get the tokens for the validation and test sets
    print("Computing validation tokens")
    corpus_validation_index, oov_validation = get_tokens(
        tweetsValidation, word_emb_indices)
    print("OOV validation: %d" % oov_validation)
    # print(np.unique([el for sub in corpus_validation_index for el in sub], return_counts=True))

    print("Computing test tokens")
    corpus_test_index, oov_test = get_tokens(tweetsTest, word_emb_indices)
    print("OOV test: %d" % oov_test)
    # print(np.unique([el for sub in corpus_test_index for el in sub], return_counts=True))

    # Pad the train, validation and test token sequences
    train_features_pad = sequence.pad_sequences(
        corpus_train_index, maxlen=max_len_input, padding="post",
        truncating="post", dtype=type(corpus_train_index[0][0]))
    validation_features_pad = sequence.pad_sequences(
        corpus_validation_index, maxlen=max_len_input, padding="post",
        truncating="post", dtype=type(corpus_validation_index[0][0]))
    test_features_pad = sequence.pad_sequences(
        corpus_test_index, maxlen=max_len_input, padding="post",
        truncating="post", dtype=type(corpus_test_index[0][0]))

    return (train_features_pad, validation_features_pad, test_features_pad,
            word_embeddings, word_emb_indices)
def xgboost_train(training_path, output_path, label_col, seed, flank_size,
                  feature_dim, proximal, usegc, strategy, n_jobs, overwrite,
                  verbose):
    """XGBoost training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-xgb.pkl.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-xgb.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(f"Skipping. {outpath} exists. "
                        "use overwrite to force.",
                        fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)
    start_time = time.time()

    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)
    # recode the response so all -1 labels become 0
    resp = [v if v > 0 else 0 for v in resp]
    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)

    classifier = xgboost(feat, resp, seed, strategy, n_jobs, verbose)
    result = dict(classifier=classifier)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size, proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
def random(self, n, seed=None):
    if seed is not None:
        np_seed(seed)
    if self.isotropic:
        r = rand(n, 1) + 1
    else:
        r = rand(n, (self.dim * (self.dim + 1)) // 2) + 1
        r[:, self.dim:] /= 4
    return AtomElement(self, .8 * (2 * rand(n, self.dim) - 1), r, 2 * rand(n))
def set_seeds():
    # set all random seeds
    import tensorflow as tf
    from numpy.random import seed as np_seed
    from random import seed as py_seed
    from snorkel.utils import set_seed as snork_seed
    snork_seed(123)
    tf.random.set_seed(123)
    np_seed(123)
    py_seed(123)
def make_results_reproducible() -> None:
    """ Makes results reproducible. """
    environ['TF_DETERMINISTIC_OPS'] = '1'
    environ['PYTHONHASHSEED'] = str(seed)
    np_seed(seed)
    rn_seed(seed)
    session_conf = ConfigProto(intra_op_parallelism_threads=1,
                               inter_op_parallelism_threads=1)
    set_random_seed(seed)
    sess = Session(graph=get_default_graph(), config=session_conf)
    set_session(sess)
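# The function above reads a module-level `seed` and several TF1-style names
# that are not shown. A rough sketch of the surrounding module setup (the
# import paths and the value 42 are assumptions, not taken from the snippet):
from os import environ
from random import seed as rn_seed
from numpy.random import seed as np_seed
from tensorflow.compat.v1 import (ConfigProto, Session, get_default_graph,
                                  set_random_seed)
from tensorflow.compat.v1.keras.backend import set_session

seed = 42  # assumed module-level constant read by make_results_reproducible()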
def __enter__(self):
    if self.seed is not None:
        self.rand_state = rand_get_state()
        self.np_state = np_get_state()
        self.torch_state = torch_get_state()
        self.torch_cudnn_deterministic = torch.backends.cudnn.deterministic
        rand_seed(self.seed)
        np_seed(self.seed)
        torch_seed(self.seed)
        torch.backends.cudnn.deterministic = True
    return self
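# A plausible matching __exit__ for the context manager above (a sketch; the
# actual class is not shown). It restores the RNG states captured in
# __enter__, assuming rand_set_state, np_set_state and torch_set_state alias
# random.setstate, numpy.random.set_state and torch.set_rng_state.
def __exit__(self, exc_type, exc_value, traceback):
    if self.seed is not None:
        rand_set_state(self.rand_state)
        np_set_state(self.np_state)
        torch_set_state(self.torch_state)
        torch.backends.cudnn.deterministic = self.torch_cudnn_deterministic
    return False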
def set_seed(seed):
    from os import environ
    environ["PYTHONHASHSEED"] = '0'
    environ["CUDA_VISIBLE_DEVICES"] = '-1'
    environ["TF_CUDNN_USE_AUTOTUNE"] = '0'
    from numpy.random import seed as np_seed
    np_seed(seed)
    import random
    random.seed(seed)
    from tensorflow import set_random_seed
    set_random_seed(seed)
def random_seed(i):
    """
    Set global random seed for all underlying components.
    Uses a 'brute-force' approach by setting the underlying libraries' seeds.

    Parameters
    ----------
    i: int
        integer used as seed for random number generators
    """
    # python's random module
    python_seed(i)
    # numpy random module
    np_seed(i)
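# Minimal usage sketch (assumes python_seed / np_seed alias random.seed and
# numpy.random.seed): re-seeding with the same value reproduces both streams.
import random
import numpy as np

random_seed(7)
first = (random.random(), float(np.random.rand()))
random_seed(7)
assert first == (random.random(), float(np.random.rand()))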
def __init__(self, cfg_path):
    np_seed(0)
    seed(0)
    self.cfg_path = cfg_path
    cfg = Configuration(cfg_path).cfg

    # load data
    dataset = DataLoader(cfg, **cfg['dataset'])

    # fit models to data
    t = Trainer(**cfg['training'])
    self.val_results = t.run_models(dataset.data)

    # test data
    if 'holdout' in cfg:
        tester = Tester(**cfg['holdout'])
        tester.run(dataset.data.X_test, dataset.data.y_test)
def wrapper(random_seed: Number = None):
    # upper bound of Numpy seed is 2**32 - 1
    random_seed = random_seed if random_seed else randint(0, 2**32 - 1,
                                                          dtype=int64)
    # convert to int so it can be serialized
    random_seed = int(random_seed)
    np_seed(random_seed)
    problem_content: GenProblemContent = fun()
    problem = GenProblem.from_content(problem_content, name, random_seed)
    return problem
def set_seed(seed):
    from os import environ
    environ["PYTHONHASHSEED"] = '0'
    environ["CUDA_VISIBLE_DEVICES"] = '-1'
    environ["TF_CUDNN_USE_AUTOTUNE"] = '0'
    import tensorflow as tf
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    from numpy.random import seed as np_seed
    np_seed(seed)
    import random
    random.seed(seed)
    tf.set_random_seed(seed)
def gaussian_data(rho=0.5, sigma_1=1, sigma_2=1, mean_1=0, mean_2=0,
                  seed=1234):
    """Return bivariate normal data with correlation coefficient :param:`rho`,
    std deviations :param:`sigma_1`, :param:`sigma_2`,
    means :param:`mean_1`, :param:`mean_2`
    """
    from numpy.random import multivariate_normal
    from numpy.random import seed as np_seed
    np_seed(seed)
    cov = np.array([[float(sigma_1)**2,
                     float(rho)*float(sigma_1)*float(sigma_2)],
                    [float(rho)*float(sigma_1)*float(sigma_2),
                     float(sigma_2)**2]], dtype=float)
    means = np.array([float(mean_1), float(mean_2)], dtype=float)
    # tr_size (the sample size) is defined outside this snippet
    return multivariate_normal(means, cov, int(tr_size))
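# Self-contained sketch of the same construction (the sample size 10_000 is an
# assumption; the snippet above relies on an external `tr_size`): the empirical
# correlation of the draws should land close to rho.
import numpy as np

rho, s1, s2 = 0.5, 1.0, 2.0
cov = np.array([[s1 ** 2, rho * s1 * s2],
                [rho * s1 * s2, s2 ** 2]])
np.random.seed(1234)
xy = np.random.multivariate_normal([0.0, 0.0], cov, 10_000)
print(np.corrcoef(xy.T)[0, 1])  # roughly 0.5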
def read_embeddings(path, offset, random_state=42):
    """Load embeddings file.
    """
    word_embeddings = [[] for i in range(offset)]
    word_indexes = {}
    with open(path, "r", encoding="utf-8") as emb_file:
        emb_file.readline()
        for line in emb_file:
            fields = line.partition(EMB_SEP_CHAR)
            word = fields[0].strip()
            own_strip = str.strip
            emb_values = np_array(
                [float(x) for x in own_strip(fields[-1]).split(EMB_SEP_CHAR)])
            word_indexes[word] = len(word_embeddings)
            word_embeddings.append(emb_values)

    # Offset = 2; Padding and OOV.
    np_seed(random_state)
    word_embeddings[0] = 2 * 0.1 * np_rand(len(word_embeddings[2])) - 1
    word_embeddings[1] = 2 * 0.1 * np_rand(len(word_embeddings[2])) - 1

    return (word_embeddings, word_indexes)
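# Hedged round-trip check for read_embeddings (EMB_SEP_CHAR == " " and a
# header line at the top of the file are assumptions inferred from the
# readline() skip and partition() calls above):
with open("tiny_embs.txt", "w", encoding="utf-8") as fh:
    fh.write("2 3\n")                # header: vocab size, dimensionality
    fh.write("hello 0.1 0.2 0.3\n")
    fh.write("world 0.4 0.5 0.6\n")

embs, idx = read_embeddings("tiny_embs.txt", offset=2)
assert idx["hello"] == 2             # slots 0 and 1 are padding / OOV
assert len(embs) == 4                # 2 reserved vectors + 2 words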
# Keras
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Input, Flatten, Dropout, Dense
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.initializers import GlorotUniform, Orthogonal
from kerastuner.tuners import RandomSearch

# Seed
from random import seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed

seed(0)
np_seed(0)
set_seed(0)

# Helper
from .helpers import loss


class TimeSeries:
    def __init__(self):
        self.root = ttk.Frame()

        # Get Train Set
        get_train_set_frame = ttk.Labelframe(self.root, text="Get Train Set")
        get_train_set_frame.grid(column=0, row=0)

        file_path = tk.StringVar(value="")
        ttk.Label(get_train_set_frame, text="Train File Path").grid(column=0, row=0)
import yaml

from datasets import CombinedDataset, MyCocoDataset, CityScapeDataset
from config import Config
import model as modellib

from torch import manual_seed as cpu_seed
from torch.cuda import manual_seed_all as gpu_seed
from numpy.random import seed as np_seed

RANDOM_SEED = 20180705
np_seed(RANDOM_SEED)
cpu_seed(RANDOM_SEED)
gpu_seed(RANDOM_SEED)


def create_coco_cityscape_with_mapping(subset='val'):
    assert subset in ['train', 'val']
    combined = CombinedDataset()
    with open('id_mapping.yaml', 'r') as fp:
        id_maps = yaml.load(fp)
    coco_map = id_maps['COCO']
    cityscape_map = id_maps['CITYSCAPE']
    cityscape = CityScapeDataset('/home/fattouhm/datasets/cityscape/', subset,
                                 class_map=cityscape_map)
    combined.add_dataset('cityscape', cityscape)
    if subset == 'val':
        subset = 'minival'
    coco = MyCocoDataset('/home/fattouhm/datasets/coco', subset=subset,
from .util import *
from .datastructure import ActionError
from operator import itemgetter
import gzip, sys

PY3 = sys.version_info[0] > 2
if PY3:
    from pickle import load, dump
else:
    from cPickle import load, dump

from random import seed
from numpy.random import seed as np_seed
seed(42)
np_seed(42)


class ParsingModel(object):
    def __init__(self, vocab=None, idxlabelmap=None, clf=None,
                 withdp=None, fdpvocab=None, fprojmat=None):
        """ Initialization

        :type vocab:
        :param vocab:
num_prediction_steps = args.num_prediction_steps
dropbox_parameters = args.dropbox_parameters
log_to_console = args.log_to_console

# Initialize logging
logging.initialize(__file__, log_to_console=log_to_console)
logger = logging.get_logger(__name__)

# Log input parameters
logger.info('Running with parameters input_params: %s', input_params)
logger.info('Additional parameters: image_generation_params: %s log_to_console: %s',
            image_generation_params, log_to_console)

# Predictable randomness
seed = 3
np_seed(seed)
tf_seed(seed)

# Dropbox
dropbox = None
if dropbox_parameters:
    dropbox_params = DropboxConnection.Parameters(dropbox_parameters[0],
                                                  dropbox_parameters[1])
    dropbox = DropboxConnection(dropbox_params)
    logger.info('Dropbox parameters:: dropbox_params: %s', dropbox_params)

# Model file
model_file = ModelInput(input_params.model_name)
model_file_name = model_file.file_name(0, 0)
"""Main experiment file.""" import warnings; warnings.filterwarnings("ignore") # reproducibility bit ---------------- from random import seed; seed(42) from numpy.random import seed as np_seed; np_seed(42) from tensorflow.compat.v1 import set_random_seed; set_random_seed(42) import os; os.environ['PYTHONHASHSEED'] = str(42) # ----------------------------------- import argparse from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.linear_model import LogisticRegression from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline from evaluation import Evaluation from models import (BayesFeatures, BertFeatures, WordEmbeddings) from reader import Reader, merge_datasets from utils import debug_tests class EnglishCompare(object): def __init__(self, pipeline: Pipeline, datasets: list = None, merge: bool = False, cross: bool = True, neural: bool = False, clean: bool = True, preprocess: bool = False, multi_read: int = 0) -> None: # NOTE: comment out those unavailable, or provide own list
# -*- coding: utf-8 -*-
from numpy.random import seed as np_seed
np_seed(9)
import random
random.seed(9)

import numpy
import pickle
import scipy
import os


def channel_last_reshape(im):
    by_column = im.reshape(3, 1024).T
    result = numpy.zeros((32, 32, 3))
    count = 0
    for i in range(32):
        for j in range(32):
            result[i, j] = by_column[count]
            count += 1
    return result


def write_polluted_with_cifar_100(train10_folder, base_name10, cifar100,
                                  max_size, alpha_list, base_alpha_folder):
    # Code to generate cifar-10 polluted with cifar-100
    '''
    1. Load 2 cifar 10 batches with labels and merge them.
    2. Load cifar 100 training.
    3. Select max_size from cifar 10 (12k)
    4. Build alpha_list. alpha = 0 IS ALREADY DONE!!!!!
    5. Reshape max_size first images from cifar 10 and cifar 100.
def training(self):
    docs_labels = [self.__training_corpus.get_document(doc_id).sparse_label
                   for doc_id in self.__training_corpus.corpus]
    np_seed(self.__random_seed)
    self.__classifier = SVC(kernel="linear")
    self.__classifier.fit(self.__features_training, docs_labels)
        :param timesteps: the number of days to simulate"""
        for _ in range(timesteps):
            self.evolution.timestep()
            n_infected = self.evolution.get_n_infected()
            self.infected_over_time.append(n_infected)
            if n_infected == 0:
                break


if __name__ == "__main__":
    import matplotlib.pyplot as plt

    rd_seed(707)
    np_seed(1337)

    totals = []
    plt.subplot(121)
    for _ in range(10):
        population = Population(size=1000, exp_household_size=2.3,
                                n_infected=2)
        evolution = Evolution(population=population, movement_ratio=0.25,
                              contact_ratio=1, infection_probability=0.25,
                              test_ratio=0.5)
        runner = Runner(evolution)
        runner.run(100)
A script to create positive and negative samples using self-supervision.
"""

import os
import sys
import random
import pandas as pd

from numpy.random import seed as np_seed

from ustools.folder_utils import get_utterance_id, get_dir_info

from ultrasync.create_sync_samples_utils import create_samples, save_samples_to_disk

random.seed(2018)
np_seed(2018)


def mirror_folder_structure(input_path, output_path):
    """
    Function to create a mirror of the input path folder structure in output path.
    Adapted from https://stackoverflow.com/a/40829525/5190279

    :param input_path:
    :param output_path:
    :return: a list of pairs of core dir which contains files, and corresponding generated dir
    """
    folder_pairs = []
    for dirpath, dirnames, filenames in os.walk(input_path):
def _set_seed(random_seed):
    np_seed(random_seed)
def _fit(
        self,
        data,
        device,
        seed=None,
        verbose=False,
        normed=False,
        munkres_id=False,
        gamma=10,
        callback=None
):
    """
    Fit the mixture model to the data
    use get_results() to get the fitted model
    """
    points = data.copy().astype('double')

    if normed:
        self.data = points
        self.m = np.zeros(self.data.shape[1])
        self.s = np.ones(self.data.shape[1])
    else:
        self.m = points.mean(0)
        self.s = points.std(0)
        # in case any of the std's are zero
        if type(self.s) == np.float64:
            if self.s == 0:
                self.s = 1
        else:
            self.s[self.s == 0] = 1
        self.data = (points - self.m) / self.s

    if len(self.data.shape) == 1:
        self.data = self.data.reshape((self.data.shape[0], 1))

    if len(self.data.shape) != 2:
        raise ValueError("points array is the wrong shape")
    self.n, self.d = self.data.shape

    if self._ref is not None:
        munkres_id = True
        self._load_ref_at_fit(points)

    if self.prior_mu is not None:
        self._load_mu_at_fit()
    if self.prior_sigma is not None:
        self._load_sigma_at_fit()

    if seed:
        np_seed(seed)
    else:
        np_seed(datetime.now().microsecond)

    # TODO move hyper-parameter settings here
    if self.model.lower() == 'bem':
        self.cdp = BEMNormalMixture(
            self.data,
            ncomp=self.n_clusters,
            gamma0=gamma,
            m0=self.m_0,
            nu0=self.nu_0,
            Phi0=self.phi_0,
            e0=self.e0,
            f0=self.f0,
            mu0=self._prior_mu,
            Sigma0=self._prior_sigma,
            weights0=self._prior_pi,
            alpha0=self.alpha_0,
            parallel=self.parallel,
            verbose=verbose
        )
        self.cdp.optimize(self.n_iterations, device=device)
    else:
        self.cdp = DPNormalMixture(
            self.data,
            ncomp=self.n_clusters,
            gamma0=gamma,
            m0=self.m_0,
            nu0=self.nu_0,
            Phi0=self.phi_0,
            e0=self.e0,
            f0=self.f0,
            mu0=self._prior_mu,
            Sigma0=self._prior_sigma,
            weights0=self._prior_pi,
            alpha0=self.alpha_0,
            parallel=self.parallel,
            verbose=verbose)
        self.cdp.sample(
            niter=self.n_iterations,
            nburn=self.burn_in,
            thin=1,
            ident=munkres_id,
            device=device,
            callback=callback
        )

    if self.model.lower() == 'bem':
        results = []
        for j in range(self.n_clusters):
            tmp = DPCluster(
                self.cdp.weights[j],
                (self.cdp.mu[j] * self.s) + self.m,
                self.cdp.Sigma[j] * np.outer(self.s, self.s),
                self.cdp.mu[j],
                self.cdp.Sigma[j]
            )
            results.append(tmp)
        tmp = DPMixture(results, niter=1, m=self.m, s=self.s)
    else:
        results = []
        for i in range(self.n_iterations):
            for j in range(self.n_clusters):
                tmp = DPCluster(
                    self.cdp.weights[i, j],
                    (self.cdp.mu[i, j] * self.s) + self.m,
                    self.cdp.Sigma[i, j] * np.outer(self.s, self.s),
                    self.cdp.mu[i, j],
                    self.cdp.Sigma[i, j]
                )
                results.append(tmp)
        tmp = DPMixture(
            results, self.n_iterations, self.m, self.s, munkres_id)

    return tmp
def sample_data(enu_path, germline_path, output_path, seed, train_size,
                enu_ratio, numreps, overwrite):
    """creates train/test sample data"""
    if seed is None:
        seed = int(time.time())

    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])

    # set the random number seed
    np_seed(seed)
    start_time = time.time()
    os.makedirs(output_path, exist_ok=True)

    logfile_path = os.path.join(output_path, "logs/data_sampling.log")
    if os.path.exists(logfile_path) and not overwrite:
        click.secho(f"Exists: {logfile_path}! use overwrite to force.",
                    fg='red')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(enu_path)
    LOGGER.input_file(germline_path)

    enu = pandas.read_csv(enu_path, sep="\t", header=0)
    germline = pandas.read_csv(germline_path, sep="\t", header=0)
    train_size = train_size // 2
    test_size = train_size
    train_enu_ratio, test_enu_ratio = enu_ratio
    enu_train_size, germ_train_size = get_enu_germline_sizes(
        train_size, train_enu_ratio)
    enu_test_size, germ_test_size = get_enu_germline_sizes(
        test_size, test_enu_ratio)
    assert min(enu_train_size, germ_train_size,
               enu_test_size, germ_test_size) > 0

    if (2 * train_size > enu.shape[0] or
            2 * train_size > germline.shape[0]):
        print(f"ENU data set size: {enu.shape[0]}")
        print(f"Germline data set size: {germline.shape[0]}")
        print(f"Train set size: {train_size}")
        raise ValueError("2 x train size exceeds"
                         " size of training data source(s)")

    for rep in range(numreps):
        test_outpath = os.path.join(output_path, f"test-{rep}.tsv.gz")
        train_outpath = os.path.join(output_path, f"train-{rep}.tsv.gz")
        enu_training, enu_testing = train_test_split(
            enu, test_size=enu_test_size, train_size=enu_train_size)
        germ_training, germ_testing = train_test_split(
            germline, test_size=germ_test_size, train_size=germ_train_size)
        if any(map(lambda x: x.shape[0] == 0,
                   [enu_training, enu_testing, germ_training,
                    germ_testing])):
            raise RuntimeError("screw up in creating test/train set")

        # concat the data frames
        testing = pandas.concat([enu_testing, germ_testing])
        training = pandas.concat([enu_training, germ_training])
        # write out, separately, the ENU and Germline data for train and test
        testing.to_csv(test_outpath, index=False, sep="\t",
                       compression='gzip')
        training.to_csv(train_outpath, index=False, sep="\t",
                        compression='gzip')

        LOGGER.output_file(test_outpath)
        LOGGER.output_file(train_outpath)

    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
def fit(
        self,
        data_sets,
        device,
        seed=None,
        verbose=False,
        munkres_id=False,
        tune_interval=100,
        initial_weights=None,
        gamma=10,
        callback=None
):
    self.d = data_sets[0].shape[1]
    data_sets = [i.copy().astype('double') for i in data_sets]
    self.n_data_sets = len(data_sets)
    total_data = np.vstack(data_sets)
    self.m = np.mean(total_data, 0)
    self.s = np.std(total_data, 0)

    standardized = []
    for i in data_sets:
        if i.shape[1] != self.d:
            raise RuntimeError("Shape of data sets do not match")
        standardized.append(((i - self.m) / self.s))

    if self.prior_mu is not None:
        self._load_mu_at_fit()
    if self.prior_sigma is not None:
        self._load_sigma_at_fit()

    if initial_weights is not None:
        if initial_weights.shape[0] != self.n_data_sets:
            raise ValueError(
                "Initial weights do not match the number of data sets"
            )
        if initial_weights.shape[1] != self.n_clusters:
            raise ValueError(
                "Initial weights do not match the number of components"
            )
        self._prior_pi = initial_weights

    if seed is not None:
        np_seed(seed)
    else:
        np_seed(datetime.now().microsecond)

    self.hdp = HDPNormalMixture(
        standardized,
        ncomp=self.n_clusters,
        gamma0=gamma,
        m0=self.m_0,
        nu0=self.nu_0,
        Phi0=self.phi_0,
        e0=self.e0,
        f0=self.f0,
        g0=self.g0,
        h0=self.h0,
        mu0=self._prior_mu,
        Sigma0=self._prior_sigma,
        weights0=self._prior_pi,
        alpha0=self.alpha_0,
        parallel=self.parallel,
        verbose=verbose)

    if not device:
        self.hdp.gpu = False

    self.hdp.sample(
        niter=self.n_iterations,
        nburn=self.burn_in,
        thin=1,
        ident=munkres_id,
        tune_interval=tune_interval,
        device=device,
        callback=callback
    )

    pis = np.array(
        [
            self.hdp.weights[-self.n_iterations:, k, :].flatten()
            for k in range(self.n_data_sets)
        ]
    )
    mus = (
        self.hdp.mu[-self.n_iterations:].reshape(
            self.n_clusters * self.n_iterations, self.d
        ) * self.s + self.m
    )
    sigmas = (
        self.hdp.Sigma[-self.n_iterations:].reshape(
            self.n_clusters * self.n_iterations, self.d, self.d
        ) * np.outer(self.s, self.s)
    )

    return HDPMixture(
        pis,
        mus,
        sigmas,
        self.n_iterations,
        self.m,
        self.s,
        munkres_id
    )
def nb_train(training_path, output_path, label_col, seed, scoring, flank_size,
             feature_dim, proximal, usegc, alpha_options, class_prior, n_jobs,
             overwrite, verbose):
    """Naive Bayes training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-nb.pkl.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-nb.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(f"Skipping. {outpath} exists. "
                        "use overwrite to force.",
                        fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)
    start_time = time.time()

    if class_prior is not None:
        class_labels = list(class_prior)
        encoded = transform_response(class_labels)
        ordered = sorted(zip(encoded, class_labels))
        class_prior = [class_prior[l] for _, l in ordered]

    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)
    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)

    classifier = naive_bayes(feat, resp, seed, alpha_options, scoring,
                             class_prior=class_prior, n_jobs=n_jobs)
    betas = dict(zip(names, classifier.best_estimator_.coef_.tolist()[0]))
    result = dict(classifier=classifier.best_estimator_, betas=betas,
                  scoring=scoring)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size, proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open_(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
def __init_random_seed(self):
    np_seed(self.__random_seed)
    tsf_set_random_seed(self.__random_seed)