Code example #1
import os
os.environ['ODIN'] = 'float32,gpu'

import numpy as np

from odin import backend as K, nnet as N, visual as V
from odin import preprocessing as pp
from odin.utils import (args_parse, stdio,
                        get_module_from_path, get_script_path)
from odin.utils.mpi import cpu_count

from utils import (WAV_FILES, SAMPLED_WAV_FILE,
                   PATH_ACOUSTIC_FEAT, PATH_EXP)
# ===========================================================================
# Config
# ===========================================================================
stdio(os.path.join(PATH_EXP, 'features_extraction.log'))
args = args_parse(descriptions=[
    ('recipe', 'the name of function defined in feature_recipes.py', None),
    ('--debug', 'enable debug or not', None, False)
])
DEBUG = args.debug
# ===========================================================================
# Create the recipes
# ===========================================================================
extractor = get_module_from_path(identifier=str(args.recipe),
                                 prefix='feature_recipes',
                                 path=get_script_path())
assert len(extractor) > 0, \
"Cannot find any recipe with name: '%s' from path: '%s'" % (args.recipe, get_script_path())
recipe = extractor[0](DEBUG)
# ====== debugging ====== #
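All of the snippets on this page share the same logging pattern: configure the environment, then call stdio() so that everything printed afterwards is also written to a log file. A minimal sketch of just that pattern ('demo.log' is a placeholder; the behaviour of stdio/get_logpath is inferred from the surrounding examples, not taken from their documentation):

from odin.utils import stdio, get_logpath

# Assumption based on the examples here: stdio(path) tees every subsequent
# print() into `path` in addition to the console, and get_logpath(..., override=True)
# returns a fresh, writable log file path.
stdio(get_logpath('demo.log', override=True))
print('this line appears on the console and inside demo.log')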
Code example #2
File: cifar10.py  Project: johndpope/odin-ai
).add('-model', 'model name, specified in models_cifar.py', 'cnn'
).parse()

import os
os.environ['ODIN'] = 'float32,gpu,seed=87654321,log'

import numpy as np
import tensorflow as tf

from odin import fuel as F, nnet as N, backend as K, training, utils
from odin.stats import train_valid_test_split
from odin.utils import stdio, one_hot  # assumed import for the two helpers used below

MODEL_NAME = args.model
MODEL_PATH = utils.get_modelpath(name='cifar10_%s' % MODEL_NAME, override=True)
LOG_PATH = utils.get_logpath(name='cifar10_%s.log' % MODEL_NAME, override=True)
stdio(LOG_PATH)
# ===========================================================================
# Some handmade constants
# ===========================================================================
NB_EPOCH = 10
LEARNING_RATE = 0.001
# ===========================================================================
# Load dataset
# ===========================================================================
ds = F.CIFAR10.get_dataset()
nb_labels = 10
print(ds)
X_train = ds['X_train'][:].astype('float32') / 255.
y_train = one_hot(ds['y_train'][:], nb_classes=nb_labels)
X_test = ds['X_test'][:].astype('float32') / 255.
y_test = one_hot(ds['y_test'][:], nb_classes=nb_labels)
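one_hot() above presumably turns the integer CIFAR-10 labels into a (n_samples, nb_classes) indicator matrix. A NumPy sketch of that assumed behaviour (an illustration, not the library's actual implementation):

import numpy as np

def one_hot_sketch(y, nb_classes):
  # one 1.0 per row, at the column given by the integer label
  y = np.asarray(y, dtype='int64').ravel()
  out = np.zeros((len(y), nb_classes), dtype='float32')
  out[np.arange(len(y)), y] = 1.
  return out

# one_hot_sketch([0, 2, 1], nb_classes=3) ->
# [[1, 0, 0], [0, 0, 1], [0, 1, 0]]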
Code example #3
                                                  False).parse()
no_train = flags.no_train
no_score = flags.no_score
analyze = flags.analyze
# assume the scores are already computed when analyze is enabled
if analyze:
  no_train = True
  no_score = True

# ===========================================================================
# Configurations
# ===========================================================================
path = '/tmp/grid'
if not os.path.exists(path):
  os.mkdir(path)
stdio(os.path.join(path, 'log.txt'))

gene = x_train.shape[1]
prot = y_train.shape[1]
epochs = 200
batch_size = 128
ncpu = 4

# ===========================================================================
# Generate all jobs
# ===========================================================================
jobs = []
for nlayers in [1, 2, 3]:
  for hdim in [32, 128, 512]:
    for zdim in [16, 32, 64]:
      for model in [
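The nested loops above enumerate every hyper-parameter combination for the grid. An equivalent sketch using itertools.product; the model list is a hypothetical placeholder, since the original excerpt is cut off before it:

import itertools

models = ['model_a', 'model_b']  # hypothetical placeholder
jobs = [dict(nlayers=nlayers, hdim=hdim, zdim=zdim, model=model)
        for nlayers, hdim, zdim, model in
        itertools.product([1, 2, 3], [32, 128, 512], [16, 32, 64], models)]
# 3 * 3 * 3 * len(models) job configurations in total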
Code example #4
File: processor.py  Project: professorlust/odin-ai
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext('   *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" % str(type(ds)))
    print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
    # ====== extract the config of the dataset ====== #
    if 'config' not in ds:
        raise RuntimeError(
            "The `Dataset` must be generated by `FeatureProcessor` "
            "which must contain `config` MmapDict of extracted "
            "features configuration.")
    # config = ds['config']
    # pipeline = ds['pipeline']
    # ====== output path ====== #
    path = str(path)
    if not os.path.exists(path):
        os.mkdir(path)
    elif override:
        if os.path.isfile(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
        os.mkdir(path)
    else:
        raise ValueError("`path`=%s exists, cannot override." % path)
    prev_stdio = get_stdio_path()
    stdio(path=os.path.join(path, 'log.txt'))
    nb_samples = int(nb_samples)
    # ====== get all features ====== #
    # [(name, dtype, statistic-able), ...]
    all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
    # store all features (including the features in external_indices)
    all_features = []
    # the external indices can be: indices_mfcc_bnf
    external_indices = flatten_list([
        k.split('_')[1:] for k in all_keys if 'indices' in k and k != 'indices'
    ])
    # ====== checking indices ====== #
    main_indices = {
        name: (start, end)
        for name, (start, end) in ds['indices'].items()
    }
    for ids_name in (k for k in all_keys if 'indices' in k):
        ids = sorted([(name, start, end)
                      for name, (start, end) in ds[ids_name].items()],
                     key=lambda x: x[1])
        for prev, now in zip(ids, ids[1:]):
            assert prev[2] == now[1], "Indices are not contiguous"
            assert prev[2] - prev[1] > 0, "Zero length in indices"
            assert now[2] - now[1] > 0, "Zero length in indices"
        # the final index must match the length of the Data
        if ids_name != 'indices':
            for feat_name in ids_name.split('_')[1:]:
                assert now[-1] == len(ds[feat_name]), \
                    "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                    (ids_name, feat_name)
                all_features.append(feat_name)
        else:
            for feat_name in all_keys:
                if feat_name not in external_indices and \
                'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
                'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
                isinstance(ds[feat_name], MmapData):
                    assert now[-1] == len(ds[feat_name]), \
                    "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
                    all_features.append(feat_name)
        # logging
        logger("Checked all:", ids_name, True)
    # ====== check all dictionary types ====== #
    for name in all_keys:
        if isinstance(ds[name], MmapDict) and 'indices' not in name:
            data = ds[name]
            # special cases
            if name == 'sr':
                checking_func = lambda x: x > 0  # for sr
            else:
                checking_func = lambda x: True
            # check
            for key, val in data.items():
                assert key in main_indices, \
                "Dictionary with name:'%s' has key not found in indices." % name
                assert checking_func(val)
            logger("Checked dictionary: ", name, True)
    # ====== checking each type of data ====== #
    # get all stats name
    all_stats = defaultdict(list)
    for k in all_keys:
        if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
        'mean' == k[-4:] or 'std' == k[-3:]:
            all_stats[k[:-4].split('_')[0]].append(k)
    # get all pca name
    all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
    # checking one-by-one numpy.ndarray features array
    for feat_name in all_features:
        dtype = str(ds[feat_name].dtype)
        # checking all data
        indices = ds.find_prefix(feat_name, 'indices')
        prog = Progbar(target=len(indices),
                       interval=0.1,
                       print_report=True,
                       name='Checking: %s(%s)' % (feat_name, dtype))
        # start iterating over all data file
        fail_test = False
        for file_name, (start, end) in indices:
            dat = ds[feat_name][start:end]
            # No NaN value
            if np.any(np.isnan(dat)):
                logger("NaN values", file_name + ':' + feat_name, False)
                fail_test = True
            # flag files where every value is close to zero
            if np.all(np.isclose(dat, 0.)):
                logger("All-closed-zeros values", file_name + ':' + feat_name,
                       False)
                fail_test = True
            prog['Name'] = file_name
            prog.add(1)
        if not fail_test:
            logger("Check data incredibility for: ", feat_name, True)
        # checking statistics
        if feat_name in all_stats:
            fail_test = False
            for stat_name in all_stats[feat_name]:
                X = ds[stat_name]
                if X.ndim >= 1:
                    X = X[:]
                if np.any(np.isnan(X)):
                    logger("NaN values", feat_name + ':' + stat_name, False)
                    fail_test = True
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values",
                           feat_name + ':' + stat_name, False)
                    fail_test = True
            if not fail_test:
                logger("Check statistics for: ", feat_name, True)
        # check PCA
        if feat_name in all_pca:
            pca = ds[all_pca[feat_name]]
            n = ds[feat_name].shape[0]
            nb_feats = ds[feat_name].shape[-1]
            fail_test = False
            # performing PCA on random samples
            for i in range(nb_samples):
                start = np.random.randint(0, n - nb_samples - 1)
                X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                                  n_components=max(nb_feats // 2, 1))
                if np.any(np.isnan(X)):
                    logger("NaN values in PCA", feat_name, False)
                    fail_test = True
                    break
                if np.all(np.isclose(X, 0.)):
                    logger("All-closed-zeros values in PCA", feat_name, False)
                    fail_test = True
                    break
            if not fail_test:
                logger("Check PCA for: ", feat_name, True)
    # ====== Do sampling ====== #
    np.random.seed(seed)  # seed for reproducibility
    all_samples = np.random.choice(list(ds['indices'].keys()),
                                   size=nb_samples,
                                   replace=False)
    # plotting all samples
    for sample_id, file_name in enumerate(all_samples):
        X = {}
        for feat_name in all_features:
            start, end = ds.find_prefix(feat_name, 'indices')[file_name]
            feat = ds[feat_name][start:end]
            X[feat_name] = feat
            # some special handling
            try:
                _special_cases(X=feat,
                               feat_name=feat_name,
                               file_name=file_name,
                               ds=ds,
                               path=path)
            except Exception as e:
                logger("Special case error: %s" % str(e),
                       file_name + ':' + feat_name, False)
        plot_multiple_features(X, title=file_name, fig_width=fig_width)
        figure_path = os.path.join(path,
                                   '%s.pdf' % _escape_file_name(file_name))
        plot_save(figure_path, log=False, clear_all=True)
        logger("Sample figure saved at: ", figure_path, True)
    # plotting the statistic
    figure_path = os.path.join(path, 'stats.pdf')
    for feat_name, stat_name in all_stats.items():
        X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
        if len(X) > 0:
            plot_multiple_features(X, title=feat_name, fig_width=fig_width)
    plot_save(figure_path, log=False, clear_all=True)
    logger("Stats figure save at: ", figure_path, True)
    logger("All reports at folder: ", os.path.abspath(path), True)
    # ====== cleaning ====== #
    stdio(path=prev_stdio)
    if should_close_ds:
        ds.close()
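A hedged usage sketch for validate_features(); both paths below are hypothetical placeholders, and the input folder must be a Dataset produced by a FeatureProcessor (so that it contains the 'config' entry checked above):

validate_features('/path/to/extracted_features',  # hypothetical dataset folder
                  path='/tmp/feature_report',     # hypothetical report folder: figures, stats.pdf, log.txt
                  nb_samples=8,
                  override=True)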
Code example #5
File: train_xvec.py  Project: imito/odin
# ===========================================================================
args = args_parse([
    ('recipe', 'the name of function defined in feature_recipes.py', None),
    ('-feat', "Acoustic feature", ('mspec', 'mfcc'), 'mspec'),
    ('-batch', "batch size", None, 64),
    ('-epoch', "number of epoch", None, 25),
    ('-l', "audio segmenting length in second", None, 3),
    ('--debug', "enable debug mode", None, False),
    ('--train', "force continue training the saved model", None, False),
])
FEAT = args.feat
TRAIN_MODEL = args.train
DEBUG = bool(args.debug)
(EXP_DIR, MODEL_PATH, LOG_PATH,
 TRAIN_PATH, TEST_PATH) = get_model_path('xvec', args)
stdio(LOG_PATH)
# ===========================================================================
# Create data feeder
# ===========================================================================
(train, valid,
 test_ids, test_dat,
 all_speakers) = prepare_dnn_data(
    recipe=args.recipe, feat=FEAT, utt_length=args.l)
n_speakers = len(all_speakers) + 1
# ===========================================================================
# Create the network
# ===========================================================================
inputs = [K.placeholder(shape=(None,) + shape[1:],
                        dtype='float32',
                        name='input%d' % i)
          for i, shape in enumerate(as_tuple_of_shape(train.shape))]
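To make the comprehension above concrete, a hypothetical illustration; the shapes are invented for the example, not taken from the source:

from odin import backend as K

# Hypothetical: two inputs, e.g. acoustic features and an auxiliary vector.
shapes = ((None, 200, 40), (None, 100))
inputs = [K.placeholder(shape=s, dtype='float32', name='input%d' % i)
          for i, s in enumerate(shapes)]
# -> [input0 with shape (None, 200, 40), input1 with shape (None, 100)]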
Code example #6
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

epochs = 200
batch_size = 128
max_evals = 80
algorithm = 'bayes'
freq = 1000  # i.e. the callback effectively only runs at on_train_end

path = '/tmp/autotune'
if os.path.exists(path):
    shutil.rmtree(path)
os.mkdir(path)

# sc_metrics are more robust to NaN values
# TODO: accept a list of loss_name
stdio(os.path.join(path, 'fit_hyper.txt'))
# ===========================================================================
# Cortex
# ===========================================================================
x, y = get_dataset('cortex')
x.filter_cells(min_counts=1).filter_genes(min_counts=1)
gene = x.shape[1]
prot = y.shape[1]

SCVI.fit_hyper(x,
               loss_name='nllk0',
               model_kwargs=dict(units=gene, xdist='zinbd'),
               fit_kwargs=dict(epochs=epochs,
                               batch_size=batch_size,
                               callbacks=[NegativeLogLikelihood(freq=freq)]),
               max_evals=max_evals,
Code example #7
# CONST
# ===========================================================================
# ====== main path ====== #
# inpath = "/mnt/sdb1/TIDIGITS"
inpath = args.path
outpath = '/home/trung/data/TIDIGITS_wav'
compress_path = '/home/trung/data/TIDIGITS.zip'
# ====== others ====== #
wav_path = os.path.join(inpath, "wave")
infopath = os.path.join(inpath, 'data/children/doc/spkrinfo.txt')
logpath = os.path.join(inpath, 'log.txt')
print("Input path:       ", ctext(inpath, 'cyan'))
print("Output path:      ", ctext(outpath, 'cyan'))
print("Convert to WAV at:", ctext(wav_path, 'cyan'))
print("Log path:         ", ctext(logpath, 'cyan'))
stdio(logpath)

exts = get_all_ext(inpath)
audio_files = get_all_files(
    inpath,
    filter_func=lambda f: f[-4:] == '.wav' and f.split('/')[-3] in
    ('girl', 'boy', 'man', 'woman'))
# ID     Gender     Age     Dialect    Usage
# ID - Unique 2-character speaker identifier
# Gender - (M-man, W-woman, B-boy, G-girl)
# Age - Speaker age at time of recording
# Dialect - Dialect region identifier (see file "dialects.txt" for decode)
# Usage - (TST-test material, TRN-training material)
info = np.genfromtxt(infopath, dtype=str, skip_header=12)
info = {
    ID.lower(): (Gender.lower(), Age, Dialect, Usage)
Code example #8
File: float16_vs_float32.py  Project: imito/odin
# => Gaussian normalized is better, and float16 is no different from float32
# ===========================================================================
from __future__ import print_function, division, absolute_import

import numpy as np

import os
os.environ['ODIN'] = 'float32,gpu,theano,seed=12,cnmem=0.4'

from odin import backend as K
from odin import nnet as N
from odin import fuel, training
from odin.utils import get_modelpath, ArgController, stdio, get_logpath
from six.moves import cPickle

stdio(get_logpath('tmp.log'))

# ===========================================================================
# Load data
# ===========================================================================
ds = fuel.load_cifar10()
print(ds)

X_train = K.placeholder(shape=(None,) + ds['X_train'].shape[1:], name='X_train')
X_score = K.placeholder(shape=(None,) + ds['X_train'].shape[1:], name='X_score')
y = K.placeholder(shape=(None,), name='y', dtype='int32')

# ===========================================================================
# Build network
# ===========================================================================
ops = N.Sequence([
Code example #9
File: float16_vs_float32.py  Project: liqin123/odin
# ===========================================================================
from __future__ import print_function, division, absolute_import

import numpy as np

import os

os.environ['ODIN'] = 'float32,gpu,theano,seed=12,cnmem=0.4'

from odin import backend as K
from odin import nnet as N
from odin import fuel, training
from odin.utils import get_modelpath, ArgController, stdio, get_logpath
from six.moves import cPickle

stdio(get_logpath('tmp.log'))

# ===========================================================================
# Load data
# ===========================================================================
ds = fuel.load_cifar10()
print(ds)

X_train = K.placeholder(shape=(None, ) + ds['X_train'].shape[1:],
                        name='X_train')
X_score = K.placeholder(shape=(None, ) + ds['X_train'].shape[1:],
                        name='X_score')
y = K.placeholder(shape=(None, ), name='y', dtype='int32')

# ===========================================================================
# Build network
Code example #10
File: make_score.py  Project: trungnt13/odin-ai
from odin import preprocessing as pp
from odin import fuel as F, nnet as N, backend as K
from odin.utils import (get_module_from_path, get_script_path, ctext, Progbar,
                        stdio, get_logpath, get_formatted_datetime)
from odin.stats import describe

from helpers import (SCORING_DATASETS, BACKEND_DATASETS, SCORE_SYSTEM_NAME,
                     SCORE_SYSTEM_ID, N_PLDA, N_LDA, PLDA_MAXIMUM_LIKELIHOOD,
                     PLDA_SHOW_LLK, PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE,
                     FEATURE_NAME, get_model_path, NCPU, get_logpath,
                     prepare_dnn_feeder_recipe, sre_file_list, Config, EXP_DIR,
                     VECTORS_DIR, RESULT_DIR, filter_utterances)
# ====== scoring log ====== #
stdio(
    get_logpath(name='make_score.log',
                increasing=True,
                odin_base=False,
                root=EXP_DIR))
print('=' * 48)
print(get_formatted_datetime(only_number=False))
print("System name    :", SCORE_SYSTEM_NAME)
print("System id      :", SCORE_SYSTEM_ID)
print("Feature recipe :", FEATURE_RECIPE)
print("Feature name   :", FEATURE_NAME)
print("Backend dataset:", ','.join(BACKEND_DATASETS.keys()))
print("Scoring dataset:", ','.join(SCORING_DATASETS.keys()))
print('=' * 48)


# ===========================================================================
# Some helper
Code example #11
File: tidigits_preprocessor.py  Project: imito/odin
# CONST
# ===========================================================================
# ====== main path ====== #
# inpath = "/mnt/sdb1/TIDIGITS"
inpath = args.path
outpath = '/home/trung/data/TIDIGITS_wav'
compress_path = '/home/trung/data/TIDIGITS.zip'
# ====== others ====== #
wav_path = os.path.join(inpath, "wave")
infopath = os.path.join(inpath, 'data/children/doc/spkrinfo.txt')
logpath = os.path.join(inpath, 'log.txt')
print("Input path:       ", ctext(inpath, 'cyan'))
print("Output path:      ", ctext(outpath, 'cyan'))
print("Convert to WAV at:", ctext(wav_path, 'cyan'))
print("Log path:         ", ctext(logpath, 'cyan'))
stdio(logpath)

exts = get_all_ext(inpath)
audio_files = get_all_files(inpath,
                filter_func=lambda f: f[-4:] == '.wav' and
                            f.split('/')[-3] in ('girl', 'boy', 'man', 'woman'))
# ID     Gender     Age     Dialect    Usage
# ID - Unique 2-character speaker identifier
# Gender - (M-man, W-woman, B-boy, G-girl)
# Age - Speaker age at time of recording
# Dialect - Dialect region identifier (see file "dialects.txt" for decode)
# Usage - (TST-test material, TRN-training material)
info = np.genfromtxt(infopath, dtype=str, skip_header=12)
info = {ID.lower(): (Gender.lower(), Age, Dialect, Usage)
        for ID, Gender, Age, Dialect, Usage in info}
gender_map = {
Code example #12
File: analyze_data.py  Project: imito/odin
from sklearn.metrics import accuracy_score, log_loss, f1_score

from odin import fuel as F
from odin import nnet as N, backend as K
from odin import visual as V
from odin.utils import (ctext, mpi, Progbar, catch_warnings_ignore, stdio,
                        get_logpath)


from helpers import (FEATURE_RECIPE, FEATURE_NAME, PATH_ACOUSTIC_FEATURES,
                     MINIMUM_UTT_DURATION, ANALYSIS_DIR, Config,
                     filter_utterances, prepare_dnn_data)

# ====== prepare log ====== #
stdio(get_logpath(name="analyze_data.log", increasing=True,
                  odin_base=False, root=ANALYSIS_DIR))
print(ctext(FEATURE_RECIPE, 'lightyellow'))
print(ctext(FEATURE_NAME, 'lightyellow'))
assert os.path.isdir(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE))
# ====== essential path ====== #
figure_path = os.path.join(ANALYSIS_DIR, '%s_%s.pdf' %
                           (FEATURE_RECIPE.replace('_', ''), FEATURE_NAME))
print(ctext(figure_path, 'lightyellow'))
# ===========================================================================
# Load the data
# ===========================================================================
ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE),
               read_only=True)
X = ds[FEATURE_NAME]
# remove all noise data
indices = {name: (start, end)
Code example #13
# ====== import ====== #
import os

os.environ['ODIN'] = 'float32,%s,%s' % (args['dev'], args['bk'])

import numpy as np

np.random.seed(1208)

from odin import nnet as N, backend as K, fuel as F, stats
from odin.utils import get_modelpath, stdio, get_logpath, get_datasetpath
from odin.basic import has_roles, BIAS, WEIGHT
from odin import training

# set log path
stdio(path=get_logpath('digit_audio.log', override=True))

# ===========================================================================
# Get wav and process new dataset configuration
# ===========================================================================
# ====== process new features ====== #
if False:
    datapath = F.load_digit_wav()
    output_path = get_datasetpath(name='digit', override=True)
    feat = F.SpeechProcessor(datapath,
                             output_path,
                             audio_ext='wav',
                             sr_new=8000,
                             win=0.025,
                             shift=0.01,
                             nb_melfilters=40,
Code example #14
import os
os.environ['ODIN'] = 'float32,gpu'

import numpy as np

from odin import backend as K, nnet as N, visual as V
from odin import preprocessing as pp
from odin.utils import (args_parse, stdio, get_module_from_path,
                        get_script_path)
from odin.utils.mpi import cpu_count

from utils import (WAV_FILES, SAMPLED_WAV_FILE, PATH_ACOUSTIC_FEAT, PATH_EXP)
# ===========================================================================
# Config
# ===========================================================================
stdio(os.path.join(PATH_EXP, 'features_extraction.log'))
args = args_parse(
    descriptions=[('recipe',
                   'the name of function defined in feature_recipes.py',
                   None), ('--debug', 'enable debug or not', None, False)])
DEBUG = args.debug
# ===========================================================================
# Create the recipes
# ===========================================================================
extractor = get_module_from_path(identifier=str(args.recipe),
                                 prefix='feature_recipes',
                                 path=get_script_path())
assert len(extractor) > 0, \
"Cannot find any recipe with name: '%s' from path: '%s'" % (args.recipe, get_script_path())
recipe = extractor[0](DEBUG)
# ====== debugging ====== #
Code example #15
#!/usr/bin/env python
from __future__ import print_function, division, absolute_import

import numpy as np

from odin.utils import get_modelpath, ArgController, stdio, get_logpath

stdio(get_logpath('tmp.log', override=True))

arg = ArgController(version=0.12
    ).add('-backend', 'theano or tensorflow', 'tensorflow'
    ).add('-ds', 'dataset cifar10, or mnist', 'mnist'
    ).add('-epoch', 'number of epoch', 3
    ).add('-lr', 'learning rate', 0.01
    ).parse()

import os
os.environ['ODIN'] = 'float32,gpu,%s,seed=12' % arg['backend']

from odin import backend as K
from odin import nnet as N
from odin import fuel, training
from six.moves import cPickle

# ===========================================================================
# Load data
# ===========================================================================
USE_MNIST_DATA = True if 'mnist' in arg['ds'].lower() else False

if USE_MNIST_DATA:
    ds = fuel.load_mnist()
Code example #16
File: analyze_data.py  Project: trungnt13/odin-ai
from sklearn.metrics import accuracy_score, log_loss, f1_score

from odin import fuel as F
from odin import nnet as N, backend as K
from odin import visual as V
from odin.utils import (ctext, mpi, Progbar, catch_warnings_ignore, stdio,
                        get_logpath)

from helpers import (FEATURE_RECIPE, FEATURE_NAME, PATH_ACOUSTIC_FEATURES,
                     MINIMUM_UTT_DURATION, ANALYSIS_DIR, Config,
                     filter_utterances, prepare_dnn_data)

# ====== prepare log ====== #
stdio(
    get_logpath(name="analyze_data.log",
                increasing=True,
                odin_base=False,
                root=ANALYSIS_DIR))
print(ctext(FEATURE_RECIPE, 'lightyellow'))
print(ctext(FEATURE_NAME, 'lightyellow'))
assert os.path.isdir(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE))
# ====== essential path ====== #
figure_path = os.path.join(
    ANALYSIS_DIR,
    '%s_%s.pdf' % (FEATURE_RECIPE.replace('_', ''), FEATURE_NAME))
print(ctext(figure_path, 'lightyellow'))
# ===========================================================================
# Load the data
# ===========================================================================
ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE),
               read_only=True)
Code example #17
File: processor.py  Project: imito/odin
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('   *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be None, string, or Dataset. No "
                     "support for given input type: %s" % str(type(ds)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Indices are not contiguous"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # the final index must match the length of the Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
        'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
        'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
        isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
          "Length of indices and actual data mismatch, " + ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0 # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
        "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
    'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data file
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # No NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # flag files where every value is close to zero
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name,
               False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Check data incredibility for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name,
                 False)
          fail_test = True
      if not fail_test:
        logger("Check statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(
            ds[feat_name][start:(start + nb_samples)],
            n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Check PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed) # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples,
                                 replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path, '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistic
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_name in all_stats.items():
    X = {name: ds[name][:]
         for name in stat_name
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure save at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()