Example #1
    def run(function, queue, indices, rseed, *args):
        # randomize
        np_seed(rseed)
        py_seed(rseed)

        # compute and store results
        queue.put(dict(zip(indices, map(function, *args))))
Example #2
	def run(function, queue, indices, rseed, *args):
		# randomize
		np_seed(rseed)
		py_seed(rseed)

		# compute and store results
		queue.put(dict(zip(indices, map(function, *args))))
Example #3
def fix_seed(seed: int,
             deterministic: bool = False,
             benchmark: bool = False) -> None:
    """
    Fix the seed value for PyTorch, NumPy, and pure-Python ``random`` at once.

    Examples:
        >>> import torch
        >>> import numpy as np
        >>> fix_seed(0)
        >>> x = torch.randn(...)
        >>> y = np.random.randn(...)

    Args:
        seed (int): random seed value
        deterministic (bool): Whether to make CuDNN deterministic for best-effort reproducibility.
        benchmark (bool): Whether to disable CuDNN benchmark mode (sets ``cudnn.benchmark = False``).

    Returns:
        None
    """
    std_seed(seed)
    os_environ["PYTHONHASHSEED"] = str(seed)
    np_seed(seed)
    torch.manual_seed(seed)

    if cuda_is_available():
        if deterministic:
            cudnn.deterministic = True
        if benchmark:
            cudnn.benchmark = False
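fix_seed above references several aliased helpers (std_seed, os_environ, np_seed, cuda_is_available, cudnn) that are imported elsewhere in its module. A hedged guess at those imports; the exact aliases are assumptions, not taken from the original file:

import torch
from torch.backends import cudnn
from torch.cuda import is_available as cuda_is_available
from random import seed as std_seed
from numpy.random import seed as np_seed
from os import environ as os_environ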
Example #4
def SeedType(string):
    try:
        val = int(string)
        seed(val)
        np_seed(val)
        return val
    except ValueError:
        raise ArgumentTypeError("invalid seed '{0:s}'".format(string))
Example #5
def SeedType(string):
    try:
        val = int(string)
        seed(val)
        np_seed(val)
        return val
    except ValueError:
        raise ArgumentTypeError("invalid seed '{0:s}'".format(string))
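The two SeedType examples above are argparse type callables that seed both RNGs as a side effect of parsing. A minimal usage sketch, assuming the same imports the examples rely on; the --seed flag name here is illustrative:

from argparse import ArgumentParser, ArgumentTypeError
from random import seed
from numpy.random import seed as np_seed

def SeedType(string):
    try:
        val = int(string)
        seed(val)
        np_seed(val)
        return val
    except ValueError:
        raise ArgumentTypeError("invalid seed '{0:s}'".format(string))

# Hypothetical wiring: parsing --seed reseeds both RNGs before the program runs.
parser = ArgumentParser()
parser.add_argument("--seed", type=SeedType, default=None)
args = parser.parse_args(["--seed", "42"])
print(args.seed)  # 42; random and numpy.random are now seeded with 42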
Example #6
 def __enter__(self):
     self._state = getstate()
     if NUMPY:
         self._np_state = np_getstate()
     seed(self.seed)
     if NUMPY:
         np_seed(self.seed)
     return self
Example #7
 def __enter__(self):
     self._state = getstate()
     if NUMPY:
         self._np_state = np_getstate()
     seed(self.seed)
     if NUMPY:
         np_seed(self.seed)
     return self
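The two identical __enter__ methods above belong to a context manager that saves the current RNG states before reseeding; the matching __exit__ is not shown. A minimal self-contained sketch of the full pattern; the class name and constructor are assumptions, not taken from the original project:

from random import seed, getstate, setstate

try:
    from numpy.random import seed as np_seed, get_state as np_getstate, set_state as np_setstate
    NUMPY = True
except ImportError:
    NUMPY = False

class reseeded:
    def __init__(self, seed_value):
        self.seed = seed_value

    def __enter__(self):
        # save the current RNG states, then reseed
        self._state = getstate()
        if NUMPY:
            self._np_state = np_getstate()
        seed(self.seed)
        if NUMPY:
            np_seed(self.seed)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # restore the states captured on entry
        setstate(self._state)
        if NUMPY:
            np_setstate(self._np_state)
        return False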
Example #8
def lr_train(training_path, output_path, label_col, seed, scoring, flank_size,
             feature_dim, proximal, usegc, c_values, penalty_options, n_jobs,
             overwrite, verbose):
    """logistic regression training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])

    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-lr.pkl.gz")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-lr.log")
    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)

    start_time = time.time()
    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)

    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)
    classifier = logistic_regression(feat, resp, seed, scoring, c_values,
                                     penalty_options.split(","), n_jobs)
    betas = dict(zip(names, classifier.best_estimator_.coef_.tolist()[0]))
    result = dict(classifier=classifier.best_estimator_,
                  betas=betas,
                  scoring=scoring)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size,
                                    proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
Example #9
def get_features(tweetsTrain,
                 tweetsValidation,
                 tweetsTest,
                 embeddings_path,
                 random_state=None):
    """Classification with RNN and embedings (pre-trained)
    """

    #Offset = 2; Padding and OOV.
    print("Begin loading embeddings.")
    word_embeddings, word_emb_indices = read_embeddings(embeddings_path, 2)
    print("End loading embeddings.")
    np_seed(random_state)

    #Build vocabulary and corpus indexes
    print("Computing training tokens")
    vocabulary_train, corpus_train_index = fit_transform_vocabulary_pretrain_embeddings(
        tweetsTrain, word_emb_indices)

    # Set a max input length
    max_len_input = int(
        np.percentile([len(tweet_train) for tweet_train in corpus_train_index],
                      95,
                      axis=0))

    # Get the tokens for the validation and test sets
    print("Computing validation tokens")
    corpus_validation_index, oov_validation = get_tokens(
        tweetsValidation, word_emb_indices)
    print("OOV validation: %d" % oov_validation)
    #     print(np.unique([el for sub in corpus_validation_index for el in sub], return_counts=True))

    print("Computing test tokens")
    corpus_test_index, oov_test = get_tokens(tweetsTest, word_emb_indices)
    print("OOV test: %d" % oov_test)
    #     print(np.unique([el for sub in corpus_test_index for el in sub], return_counts=True))

    # Pad the train, validation and test token sequences
    train_features_pad = sequence.pad_sequences(corpus_train_index,
                                                maxlen=max_len_input,
                                                padding="post",
                                                truncating="post",
                                                dtype=type(
                                                    corpus_train_index[0][0]))
    validation_features_pad = sequence.pad_sequences(
        corpus_validation_index,
        maxlen=max_len_input,
        padding="post",
        truncating="post",
        dtype=type(corpus_validation_index[0][0]))
    test_features_pad = sequence.pad_sequences(corpus_test_index,
                                               maxlen=max_len_input,
                                               padding="post",
                                               truncating="post",
                                               dtype=type(
                                                   corpus_test_index[0][0]))

    return train_features_pad, validation_features_pad, test_features_pad, word_embeddings, word_emb_indices
Example #10
def xgboost_train(training_path, output_path, label_col, seed, flank_size,
                  feature_dim, proximal, usegc, strategy, n_jobs, overwrite,
                  verbose):
    """Naive Bayes training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-xgb.pkl.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-xgb.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)
    start_time = time.time()
    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)

    # recode response labels so any -1 becomes 0
    resp = [v if v > 0 else 0 for v in resp]

    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)

    classifier = xgboost(feat, resp, seed, strategy, n_jobs, verbose)
    result = dict(classifier=classifier)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size,
                                    proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
Example #11
 def random(self, n, seed=None):
     if seed is not None:
         np_seed(seed)
     if self.isotropic:
         r = rand(n, 1) + 1
     else:
         r = rand(n, (self.dim * (self.dim + 1)) // 2) + 1
         r[:, self.dim:] /= 4
     
     return AtomElement(self, .8 * (2 * rand(n, self.dim) - 1), r, 2 * rand(n))
Example #12
def set_seeds():
    # set all random seeds
    import tensorflow as tf
    from numpy.random import seed as np_seed
    from random import seed as py_seed
    from snorkel.utils import set_seed as snork_seed
    snork_seed(123)
    tf.random.set_seed(123)
    np_seed(123)
    py_seed(123)
Example #13
def make_results_reproducible() -> None:
    """ Makes results reproducible. """
    environ['TF_DETERMINISTIC_OPS'] = '1'
    environ['PYTHONHASHSEED'] = str(seed)
    np_seed(seed)
    rn_seed(seed)
    session_conf = ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    set_random_seed(seed)
    sess = Session(graph=get_default_graph(), config=session_conf)
    set_session(sess)
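make_results_reproducible above relies on names defined elsewhere in its module, including a module-level seed. A hedged guess at what it presumes (TensorFlow 1.x-style session APIs, reachable through tf.compat.v1 on TF 2.x); the seed value shown is hypothetical:

from os import environ
from random import seed as rn_seed
from numpy.random import seed as np_seed
from tensorflow.compat.v1 import ConfigProto, Session, get_default_graph, set_random_seed
from tensorflow.compat.v1.keras.backend import set_session

seed = 42  # hypothetical; the original module defines `seed` elsewhere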
Example #14
 def __enter__(self):
     if self.seed is not None:
         self.rand_state = rand_get_state()
         self.np_state = np_get_state()
         self.torch_state = torch_get_state()
         self.torch_cudnn_deterministic = torch.backends.cudnn.deterministic
         rand_seed(self.seed)
         np_seed(self.seed)
         torch_seed(self.seed)
         torch.backends.cudnn.deterministic = True
     return self
Example #15
def set_seed(seed):
    from os import environ
    environ["PYTHONHASHSEED"] = '0'
    environ["CUDA_VISIBLE_DEVICES"] = '-1'
    environ["TF_CUDNN_USE_AUTOTUNE"] = '0'

    from numpy.random import seed as np_seed
    np_seed(seed)
    import random
    random.seed(seed)
    from tensorflow import set_random_seed
    set_random_seed(seed)
Example #16
def random_seed(i):
    """
    Set the global random seed for all underlying components, using a 'brute-force' approach that seeds each underlying library directly.

    Parameters
    ----------
        i: int
            integer used as seed for random number generators
    """
    # python's random module
    python_seed(i)
    # numpy random module
    np_seed(i)
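A quick, self-contained check of what this brute-force seeding buys: reseeding with the same value makes both random and numpy.random replay identical draws. The reseed helper below mirrors random_seed above but is a sketch, not the original function:

import random
import numpy as np

def reseed(i):
    # same brute-force approach as random_seed above
    random.seed(i)
    np.random.seed(i)

reseed(7)
first = (random.random(), float(np.random.rand()))
reseed(7)
second = (random.random(), float(np.random.rand()))
assert first == second  # both RNGs replay the same draws after reseeding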
Example #17
 def __init__(self, cfg_path):
     np_seed(0)
     seed(0)
     self.cfg_path = cfg_path
     cfg = Configuration(cfg_path).cfg
     # load data
     dataset = DataLoader(cfg, **cfg['dataset'])
     # fit models to data
     t = Trainer(**cfg['training'])
     self.val_results = t.run_models(dataset.data)
     # test data
     if 'holdout' in cfg:
         tester = Tester(**cfg['holdout'])
         tester.run(dataset.data.X_test, dataset.data.y_test)
Example #18
        def wrapper(random_seed: Number = None):
            # upper bound of Numpy seed is 2**32 - 1
            random_seed = random_seed if random_seed else randint(
                0, 2**32 - 1, dtype=int64)

            # convert to int so it can be serialized
            random_seed = int(random_seed)

            np_seed(random_seed)

            problem_content: GenProblemContent = fun()
            problem = GenProblem.from_content(problem_content, name,
                                              random_seed)

            return problem
Example #19
def set_seed(seed):

    from os import environ
    environ["PYTHONHASHSEED"] = '0'
    environ["CUDA_VISIBLE_DEVICES"] = '-1'
    environ["TF_CUDNN_USE_AUTOTUNE"] = '0'

    import tensorflow as tf

    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)

    from numpy.random import seed as np_seed
    np_seed(seed)
    import random
    random.seed(seed)

    tf.set_random_seed(seed)
Example #20
def gaussian_data(rho=0.5, sigma_1=1, sigma_2=1, mean_1=0, mean_2=0,
                  seed=1234):
    """Return bivariate normal data with correlation coefficient
    :param:`rho`, std deviations :param:`sigma_1`, :param:`sigma_2`,
    means :param:`mean_1`, :param:`mean_2`
    """

    from numpy.random import multivariate_normal
    from numpy.random import seed as np_seed

    np_seed(seed)

    cov = np.array([[float(sigma_1)**2,
                     float(rho)*float(sigma_1)*float(sigma_2)],
                    [float(rho)*float(sigma_1)*float(sigma_2),
                     float(sigma_2)**2]], dtype=float)

    means = np.array([float(mean_1), float(mean_2)], dtype=float)

    return multivariate_normal(means, cov, int(tr_size))
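gaussian_data above references a module-level tr_size that is not shown in the snippet. A self-contained sketch of the same idea (seed NumPy, then draw correlated bivariate normal samples); the sample size of 10_000 is a stand-in for tr_size:

import numpy as np
from numpy.random import multivariate_normal
from numpy.random import seed as np_seed

np_seed(1234)  # same default seed as the example above
rho, sigma_1, sigma_2 = 0.5, 1.0, 1.0
cov = np.array([[sigma_1 ** 2, rho * sigma_1 * sigma_2],
                [rho * sigma_1 * sigma_2, sigma_2 ** 2]])
samples = multivariate_normal([0.0, 0.0], cov, 10_000)
print(np.corrcoef(samples[:, 0], samples[:, 1])[0, 1])  # close to rho, identical on every run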
Example #21
def read_embeddings(path, offset, random_state=42):
    """Load embeddings file.
    """
    word_embeddings = [[] for i in range(offset)]
    word_indexes = {}
    with open(path, "r", encoding="utf-8") as emb_file:
        emb_file.readline()
        for line in emb_file:
            fields = line.partition(EMB_SEP_CHAR)
            word = fields[0].strip()
            own_strip = str.strip
            emb_values = np_array(
                [float(x) for x in own_strip(fields[-1]).split(EMB_SEP_CHAR)])
            word_indexes[word] = len(word_embeddings)
            word_embeddings.append(emb_values)

    # Offset = 2; Padding and OOV.
    np_seed(random_state)
    word_embeddings[0] = 2 * 0.1 * np_rand(len(word_embeddings[2])) - 1
    word_embeddings[1] = 2 * 0.1 * np_rand(len(word_embeddings[2])) - 1

    return (word_embeddings, word_indexes)
Example #22
# Keras
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import Input, Flatten, Dropout, Dense
from tensorflow.keras.layers import SimpleRNN, GRU, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.initializers import GlorotUniform, Orthogonal
from kerastuner.tuners import RandomSearch

# Seed
from random import seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed
seed(0)
np_seed(0)
set_seed(0)

# Helper
from .helpers import loss

class TimeSeries:
    def __init__(self):
        self.root = ttk.Frame()
        
        # Get Train Set
        get_train_set_frame = ttk.Labelframe(self.root, text="Get Train Set")
        get_train_set_frame.grid(column=0, row=0)

        file_path = tk.StringVar(value="")
        ttk.Label(get_train_set_frame, text="Train File Path").grid(column=0, row=0)
Example #23
import yaml
from datasets import CombinedDataset, MyCocoDataset, CityScapeDataset
from config import Config
import model as modellib
from torch import manual_seed as cpu_seed
from torch.cuda import manual_seed_all as gpu_seed
from numpy.random import seed as np_seed

RANDOM_SEED = 20180705

np_seed(RANDOM_SEED)
cpu_seed(RANDOM_SEED)
gpu_seed(RANDOM_SEED)


def create_coco_cityscape_with_mapping(subset='val'):
    assert subset in ['train', 'val']
    combined = CombinedDataset()
    with open('id_mapping.yaml', 'r') as fp:
        id_maps = yaml.load(fp)
        coco_map = id_maps['COCO']
        cityscape_map = id_maps['CITYSCAPE']

    cityscape = CityScapeDataset('/home/fattouhm/datasets/cityscape/',
                                 subset,
                                 class_map=cityscape_map)
    combined.add_dataset('cityscape', cityscape)
    if subset == 'val':
        subset = 'minival'
    coco = MyCocoDataset('/home/fattouhm/datasets/coco',
                         subset=subset,
Example #24
from .util import *
from .datastructure import ActionError
from operator import itemgetter
import gzip, sys

PY3 = sys.version_info[0] > 2

if PY3:
    from pickle import load, dump
else:
    from cPickle import load, dump

from random import seed
from numpy.random import seed as np_seed
seed(42)
np_seed(42)


class ParsingModel(object):
    def __init__(self,
                 vocab=None,
                 idxlabelmap=None,
                 clf=None,
                 withdp=None,
                 fdpvocab=None,
                 fprojmat=None):
        """ Initialization
        
        :type vocab:
        :param vocab:
Example #25
    num_prediction_steps = args.num_prediction_steps

    dropbox_parameters = args.dropbox_parameters
    log_to_console = args.log_to_console

    #Initialize logging
    logging.initialize(__file__, log_to_console = log_to_console)
    logger = logging.get_logger(__name__)

    #Log input parameters
    logger.info('Running with parameters input_params: %s', input_params)
    logger.info('Additional parameters: image_generation_params: %s log_to_console: %s', image_generation_params, log_to_console)

    #Predictable randomness
    seed = 3
    np_seed(seed)
    tf_seed(seed)

    #Dropbox
    dropbox = None

    if dropbox_parameters:
        dropbox_params = DropboxConnection.Parameters(dropbox_parameters[0], dropbox_parameters[1])
        dropbox = DropboxConnection(dropbox_params)

        logger.info('Dropbox parameters:: dropbox_params: %s', dropbox_params)

    #Model file
    model_file = ModelInput(input_params.model_name)
    model_file_name = model_file.file_name(0, 0)
Example #26
"""Main experiment file."""

import warnings; warnings.filterwarnings("ignore")

# reproducibility bit ----------------
from random import seed; seed(42)
from numpy.random import seed as np_seed; np_seed(42)
from tensorflow.compat.v1 import set_random_seed; set_random_seed(42)
import os; os.environ['PYTHONHASHSEED'] = str(42)
# -----------------------------------

import argparse

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from evaluation import Evaluation
from models import (BayesFeatures, BertFeatures, WordEmbeddings)
from reader import Reader, merge_datasets
from utils import debug_tests


class EnglishCompare(object):

    def __init__(self, pipeline: Pipeline, datasets: list = None,
                 merge: bool = False, cross: bool = True, neural: bool = False,
                 clean: bool = True, preprocess: bool = False,
                 multi_read: int = 0) -> None:
        # NOTE: comment out those unavailable, or provide own list
Example #27
# -*- coding: utf-8 -*-
from numpy.random import seed as np_seed
np_seed(9)
import random
random.seed(9)

import numpy
import pickle

import scipy
import os

def channel_last_reshape(im):
    by_column = im.reshape(3, 1024).T
    result = numpy.zeros((32,32,3))
    count = 0
    for i in range(32):
        for j in range(32):
            result[i,j] = by_column[count]
            count+=1
    return result

def write_polluted_with_cifar_100(train10_folder, base_name10, cifar100, max_size, alpha_list, base_alpha_folder):
    
    # Code to generate CIFAR-10 polluted with CIFAR-100
    '''
    1. Load 2 cifar 10 batches with labels and merge them.
    2. Load cifar 100 training.
    3. Select max_size from cifar 10 (12k)
    4. Build alpha_list. alpha = 0 IS ALREADY DONE!!!!!
    5. Reshape max_size first images from cifar 10 and cifar 100.
Example #28
 def training(self):
     docs_labels = [self.__training_corpus.get_document(doc_id).sparse_label for doc_id in self.__training_corpus.corpus]
     np_seed(self.__random_seed)
     self.__classifier = SVC(kernel="linear")
     self.__classifier.fit(self.__features_training, docs_labels)
Example #29
        :param timesteps: the amount of days to simulate"""
        for _ in range(timesteps):
            self.evolution.timestep()

            n_infected = self.evolution.get_n_infected()
            self.infected_over_time.append(n_infected)
            if n_infected == 0:
                break


if __name__ == "__main__":
    import matplotlib.pyplot as plt

    rd_seed(707)
    np_seed(1337)

    totals = []
    plt.subplot(121)
    for _ in range(10):
        population = Population(size=1000,
                                exp_household_size=2.3,
                                n_infected=2)
        evolution = Evolution(population=population,
                              movement_ratio=0.25,
                              contact_ratio=1,
                              infection_probability=0.25,
                              test_ratio=0.5)

        runner = Runner(evolution)
        runner.run(100)
Example #30
A script to create positive and negative samples using self-supervision.

"""

import os
import sys
import random
import pandas as pd
from numpy.random import seed as np_seed

from ustools.folder_utils import get_utterance_id, get_dir_info
from ultrasync.create_sync_samples_utils import create_samples, save_samples_to_disk

random.seed(2018)
np_seed(2018)


def mirror_folder_structure(input_path, output_path):
    """
    Function to create a mirror of the input path folder structure in output path.
    Adapted from https://stackoverflow.com/a/40829525/5190279

    :param input_path:
    :param output_path:
    :return: a list of pairs of core dir which contains files, and corresponding generated dir
    """

    folder_pairs = []

    for dirpath, dirnames, filenames in os.walk(input_path):
Example #31
 def _set_seed(random_seed):
     np_seed(random_seed)
Example #32
    def _fit(
            self,
            data,
            device,
            seed=None,
            verbose=False,
            normed=False,
            munkres_id=False,
            gamma=10,
            callback=None
    ):
        """
        Fit the mixture model to the data
        use get_results() to get the fitted model
        """
        points = data.copy().astype('double')
        if normed:
            self.data = points
            self.m = np.zeros(self.data.shape[1])
            self.s = np.ones(self.data.shape[1])
        else:
            self.m = points.mean(0)
            self.s = points.std(0)
            # in case any of the std's are zero
            if type(self.s) == np.float64:
                if self.s == 0:
                    self.s = 1
            else:
                self.s[self.s == 0] = 1
            self.data = (points - self.m) / self.s

        if len(self.data.shape) == 1:
            self.data = self.data.reshape((self.data.shape[0], 1))

        if len(self.data.shape) != 2:
            raise ValueError("points array is the wrong shape")
        self.n, self.d = self.data.shape

        if self._ref is not None:
            munkres_id = True
            self._load_ref_at_fit(points)

        if self.prior_mu is not None:
            self._load_mu_at_fit()
        if self.prior_sigma is not None:
            self._load_sigma_at_fit()

        if seed:
            np_seed(seed)
        else:
            np_seed(datetime.now().microsecond)

        # TODO move hyper-parameter settings here
        if self.model.lower() == 'bem':
            self.cdp = BEMNormalMixture(
                self.data,
                ncomp=self.n_clusters,
                gamma0=gamma,
                m0=self.m_0,
                nu0=self.nu_0,
                Phi0=self.phi_0,
                e0=self.e0,
                f0=self.f0,
                mu0=self._prior_mu,
                Sigma0=self._prior_sigma,
                weights0=self._prior_pi,
                alpha0=self.alpha_0,
                parallel=self.parallel,
                verbose=verbose
            )
            self.cdp.optimize(self.n_iterations, device=device)
        else:
            self.cdp = DPNormalMixture(
                self.data,
                ncomp=self.n_clusters,
                gamma0=gamma,
                m0=self.m_0,
                nu0=self.nu_0,
                Phi0=self.phi_0,
                e0=self.e0,
                f0=self.f0,
                mu0=self._prior_mu,
                Sigma0=self._prior_sigma,
                weights0=self._prior_pi,
                alpha0=self.alpha_0,
                parallel=self.parallel,
                verbose=verbose)
            self.cdp.sample(
                niter=self.n_iterations,
                nburn=self.burn_in,
                thin=1,
                ident=munkres_id,
                device=device,
                callback=callback
            )

        if self.model.lower() == 'bem':
            results = []
            for j in range(self.n_clusters):
                tmp = DPCluster(
                    self.cdp.weights[j],
                    (self.cdp.mu[j] * self.s) + self.m,
                    self.cdp.Sigma[j] * np.outer(self.s, self.s),
                    self.cdp.mu[j],
                    self.cdp.Sigma[j]
                )
                results.append(tmp)
            tmp = DPMixture(results, niter=1, m=self.m, s=self.s)
        else:
            results = []
            for i in range(self.n_iterations):
                for j in range(self.n_clusters):
                    tmp = DPCluster(
                        self.cdp.weights[i, j],
                        (self.cdp.mu[i, j] * self.s) + self.m,
                        self.cdp.Sigma[i, j] * np.outer(
                            self.s, self.s),
                        self.cdp.mu[i, j],
                        self.cdp.Sigma[i, j]
                    )
                    results.append(tmp)
            tmp = DPMixture(
                results,
                self.n_iterations,
                self.m,
                self.s,
                munkres_id)
        return tmp
Example #33
def sample_data(enu_path, germline_path, output_path, seed, train_size,
                enu_ratio, numreps, overwrite):
    """creates train/test sample data"""
    if seed is None:
        seed = int(time.time())
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])

    # set the random number seed
    np_seed(seed)
    start_time = time.time()
    os.makedirs(output_path, exist_ok=True)
    logfile_path = os.path.join(output_path, "logs/data_sampling.log")
    if os.path.exists(logfile_path) and not overwrite:
        click.secho(f"Exists: {logfile_path}! use overwrite to force.",
                    fg='red')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(enu_path)
    LOGGER.input_file(germline_path)

    enu = pandas.read_csv(enu_path, sep="\t", header=0)
    germline = pandas.read_csv(germline_path, sep="\t", header=0)
    train_size = train_size // 2
    test_size = train_size
    train_enu_ratio, test_enu_ratio = enu_ratio
    enu_train_size, germ_train_size = get_enu_germline_sizes(
        train_size, train_enu_ratio)
    enu_test_size, germ_test_size = get_enu_germline_sizes(
        test_size, test_enu_ratio)
    assert min(enu_train_size, germ_train_size, enu_test_size,
               germ_test_size) > 0

    if (2 * train_size > enu.shape[0] or 2 * train_size > germline.shape[0]):
        print(f"ENU data set size: {enu.shape[0]}")
        print(f"Germline data set size: {germline.shape[0]}")
        print(f"Train set size: {train_size}")
        raise ValueError("2 x train size exceeds"
                         " size of training data source(s)")

    for rep in range(numreps):
        test_outpath = os.path.join(output_path, f"test-{rep}.tsv.gz")
        train_outpath = os.path.join(output_path, f"train-{rep}.tsv.gz")
        enu_training, enu_testing = train_test_split(enu,
                                                     test_size=enu_test_size,
                                                     train_size=enu_train_size)

        germ_training, germ_testing = train_test_split(
            germline, test_size=germ_test_size, train_size=germ_train_size)
        if any(
                map(lambda x: x.shape[0] == 0,
                    [enu_training, enu_testing, germ_training, germ_testing])):
            raise RuntimeError("screw up in creating test/train set")

        # concat the data frames
        testing = pandas.concat([enu_testing, germ_testing])
        training = pandas.concat([enu_training, germ_training])
        # write out, separately, the ENU and Germline data for train and test
        testing.to_csv(test_outpath, index=False, sep="\t", compression='gzip')
        training.to_csv(train_outpath,
                        index=False,
                        sep="\t",
                        compression='gzip')

        LOGGER.output_file(test_outpath)
        LOGGER.output_file(train_outpath)

    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
Example #34
    def fit(
            self,
            data_sets,
            device,
            seed=None,
            verbose=False,
            munkres_id=False,
            tune_interval=100,
            initial_weights=None,
            gamma=10,
            callback=None
    ):
        self.d = data_sets[0].shape[1]

        data_sets = [i.copy().astype('double') for i in data_sets]
        self.n_data_sets = len(data_sets)
        total_data = np.vstack(data_sets)
        self.m = np.mean(total_data, 0)
        self.s = np.std(total_data, 0)
        standardized = []
        for i in data_sets:
            if i.shape[1] != self.d:
                raise RuntimeError("Shape of data sets do not match")
            standardized.append(((i - self.m) / self.s))

        if self.prior_mu is not None:
            self._load_mu_at_fit()
        if self.prior_sigma is not None:
            self._load_sigma_at_fit()
        if initial_weights is not None:
            if initial_weights.shape[0] != self.n_data_sets:
                raise ValueError(
                    "Initial weights do not match the number of data sets"
                )
            if initial_weights.shape[1] != self.n_clusters:
                raise ValueError(
                    "Initial weights do not match the number of components"
                )
            self._prior_pi = initial_weights

        if seed is not None:
            np_seed(seed)
        else:
            np_seed(datetime.now().microsecond)

        self.hdp = HDPNormalMixture(
            standardized,
            ncomp=self.n_clusters,
            gamma0=gamma,
            m0=self.m_0,
            nu0=self.nu_0,
            Phi0=self.phi_0,
            e0=self.e0,
            f0=self.f0,
            g0=self.g0,
            h0=self.h0,
            mu0=self._prior_mu,
            Sigma0=self._prior_sigma,
            weights0=self._prior_pi,
            alpha0=self.alpha_0,
            parallel=self.parallel,
            verbose=verbose)
        if not device:
            self.hdp.gpu = False
        self.hdp.sample(
            niter=self.n_iterations,
            nburn=self.burn_in,
            thin=1,
            ident=munkres_id,
            tune_interval=tune_interval,
            device=device,
            callback=callback
        )

        pis = np.array(
            [
                self.hdp.weights[-self.n_iterations:, k, :].flatten()
                for k in range(self.n_data_sets)
            ]
        )
        mus = (
            self.hdp.mu[-self.n_iterations:].reshape(
                self.n_clusters * self.n_iterations,
                self.d
            ) * self.s + self.m
        )
        sigmas = (
            self.hdp.Sigma[-self.n_iterations:].reshape(
                self.n_clusters * self.n_iterations,
                self.d,
                self.d
            ) * np.outer(self.s, self.s)
        )
        return HDPMixture(
            pis,
            mus,
            sigmas,
            self.n_iterations,
            self.m,
            self.s,
            munkres_id
        )
Example #35
def nb_train(training_path, output_path, label_col, seed, scoring, flank_size,
             feature_dim, proximal, usegc, alpha_options, class_prior, n_jobs,
             overwrite, verbose):
    """Naive Bayes training, validation, dumps optimal model"""
    if not seed:
        seed = int(time.time())

    np_seed(seed)
    LOGGER.log_args()
    LOGGER.log_versions(['sklearn', 'numpy'])
    os.makedirs(output_path, exist_ok=True)

    basename = get_basename(training_path)
    outpath = os.path.join(output_path, f"{basename}-classifier-nb.pkl.gz")
    logfile_path = os.path.join(output_path,
                                f"logs/{basename}-training-nb.log")
    if os.path.exists(outpath) and not overwrite:
        if verbose > 1:
            click.secho(
                f"Skipping. {outpath} exists. "
                "use overwrite to force.",
                fg='green')
        return

    LOGGER.log_file_path = logfile_path
    LOGGER.input_file(training_path)

    start_time = time.time()
    if class_prior is not None:
        class_labels = list(class_prior)
        encoded = transform_response(class_labels)
        ordered = sorted(zip(encoded, class_labels))
        class_prior = [class_prior[l] for _, l in ordered]

    _, resp, feat, n_dims, names = data_to_numeric(training_path, label_col,
                                                   flank_size, feature_dim,
                                                   proximal, usegc)

    if usegc:
        # we need to scale the data
        scaler = get_scaler(feat)
        feat = scaler.transform(feat)
    classifier = naive_bayes(feat,
                             resp,
                             seed,
                             alpha_options,
                             scoring,
                             class_prior=class_prior,
                             n_jobs=n_jobs)
    betas = dict(zip(names, classifier.best_estimator_.coef_.tolist()[0]))
    result = dict(classifier=classifier.best_estimator_,
                  betas=betas,
                  scoring=scoring)
    result['feature_params'] = dict(feature_dim=feature_dim,
                                    flank_size=flank_size,
                                    proximal=proximal,
                                    usegc=usegc)
    if usegc:
        result['scaler'] = scaler

    with open_(outpath, 'wb') as clf_file:
        pickle.dump(result, clf_file)

    LOGGER.output_file(outpath)
    duration = time.time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)")
    LOGGER.shutdown()
Example #36
 def __init_random_seed(self):
     np_seed(self.__random_seed)
     tsf_set_random_seed(self.__random_seed)