import os
os.environ['ODIN'] = 'float32,gpu'

import numpy as np

from odin import backend as K, nnet as N, visual as V
from odin import preprocessing as pp
from odin.utils import (args_parse, stdio, get_module_from_path,
                        get_script_path)
from odin.utils.mpi import cpu_count

from utils import (WAV_FILES, SAMPLED_WAV_FILE, PATH_ACOUSTIC_FEAT, PATH_EXP)
# ===========================================================================
# Config
# ===========================================================================
stdio(os.path.join(PATH_EXP, 'features_extraction.log'))

args = args_parse(descriptions=[
    ('recipe', 'the name of function defined in feature_recipes.py', None),
    ('--debug', 'enable debug or not', None, False)
])
DEBUG = args.debug
# ===========================================================================
# Create the recipes
# ===========================================================================
extractor = get_module_from_path(identifier=str(args.recipe),
                                 prefix='feature_recipes',
                                 path=get_script_path())
assert len(extractor) > 0, \
    "Cannot find any recipe with name: '%s' from path: '%s'" % \
    (args.recipe, get_script_path())
recipe = extractor[0](DEBUG)
# ====== debugging ====== #
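# ---------------------------------------------------------------------------
# For context, a recipe is any callable defined in feature_recipes.py that
# `get_module_from_path` can find by name; it is then constructed with the
# debug flag (see `extractor[0](DEBUG)` above). A minimal hypothetical sketch
# (the name `my_recipe` and its dummy output are illustrative assumptions,
# not the project's actual recipes):
def my_recipe(debug=False):

  def extract(wav_path):
    # a real recipe would decode the audio file and compute acoustic
    # features; this placeholder returns a single dummy 40-dim frame
    return np.zeros((1, 40), dtype='float32')

  return extract
# ---------------------------------------------------------------------------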
# (the head of this argument-parsing chain is truncated in the original;
#  the ArgController construction is reconstructed here)
from odin.utils import ArgController, stdio

args = ArgController(
).add('-model', 'model name, specified in models_cifar.py', 'cnn'
).parse()

import os
os.environ['ODIN'] = 'float32,gpu,seed=87654321,log'

import numpy as np
import tensorflow as tf

from odin import fuel as F, nnet as N, backend as K, training, utils
from odin.stats import train_valid_test_split

MODEL_NAME = args.model
MODEL_PATH = utils.get_modelpath(name='cifar10_%s' % MODEL_NAME, override=True)
LOG_PATH = utils.get_logpath(name='cifar10_%s.log' % MODEL_NAME, override=True)
stdio(LOG_PATH)
# ===========================================================================
# Some handmade constants
# ===========================================================================
NB_EPOCH = 10
LEARNING_RATE = 0.001
# ===========================================================================
# Load dataset
# ===========================================================================
ds = F.CIFAR10.get_dataset()
nb_labels = 10
print(ds)


def one_hot(y, nb_classes):
  # `one_hot` is called below but never imported in this snippet; this numpy
  # stand-in is an assumption to keep the script self-contained
  return np.eye(nb_classes, dtype='float32')[np.asarray(y, dtype='int32')]


X_train = ds['X_train'][:].astype('float32') / 255.
y_train = one_hot(ds['y_train'][:], nb_classes=nb_labels)
X_test = ds['X_test'][:].astype('float32') / 255.
y_test = one_hot(ds['y_test'][:], nb_classes=nb_labels)
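# A quick sanity check on the arrays prepared above (CIFAR-10 ships 50000
# training and 10000 test images; these assertions are illustrative):
assert X_train.shape[0] == y_train.shape[0]
assert y_train.shape[1] == nb_labels
assert 0.0 <= X_train.min() <= X_train.max() <= 1.0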
# (the head of this script is truncated in the original; the imports and the
#  argument descriptions below are reconstructed assumptions)
import os

from odin.utils import ArgController, stdio

flags = ArgController(
).add('--no_train', 'skip training the models', False
).add('--no_score', 'skip scoring the models', False
).add('--analyze', 'only analyze the pre-computed scores', False
).parse()

no_train = flags.no_train
no_score = flags.no_score
analyze = flags.analyze
# assume the scores are ready when analyze is enabled
if analyze:
  no_train = True
  no_score = True
# ===========================================================================
# Configurations
# ===========================================================================
path = '/tmp/grid'
if not os.path.exists(path):
  os.mkdir(path)
stdio(os.path.join(path, 'log.txt'))

# x_train, y_train are prepared in the elided part of this script
gene = x_train.shape[1]
prot = y_train.shape[1]
epochs = 200
batch_size = 128
ncpu = 4
# ===========================================================================
# Generate all jobs
# ===========================================================================
jobs = []
for nlayers in [1, 2, 3]:
  for hdim in [32, 128, 512]:
    for zdim in [16, 32, 64]:
      for model in [
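# ---------------------------------------------------------------------------
# A minimal sketch of how such a hyper-parameter grid is typically dispatched
# in parallel, assuming each job is a (nlayers, hdim, zdim, model) tuple and
# `run_job` is a hypothetical worker; the model names are placeholders, and
# the original's model list and training code are truncated above:
from itertools import product
from multiprocessing import Pool


def run_job(job):
  nlayers, hdim, zdim, model = job
  # ... build, train, and score one model configuration here ...
  return job


if __name__ == '__main__':
  grid = list(product([1, 2, 3], [32, 128, 512], [16, 32, 64],
                      ['model_a', 'model_b']))
  with Pool(processes=ncpu) as pool:
    for finished in pool.imap_unordered(run_job, grid):
      print('done:', finished)
# ---------------------------------------------------------------------------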
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('  *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be FeatureProcessor, string, or Dataset. No "
                     "support for given input type: %s" %
                     str(type(ds_or_processor)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Discontinuous indices"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # final index must match the length of the Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
        'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
        'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
        isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
              "Length of indices and actual data mismatch, " + \
              ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0  # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
            "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
    'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data files
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # no NaN values
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all values close to zero
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name, False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Checked data integrity for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name, False)
          fail_test = True
      if not fail_test:
        logger("Checked statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                          n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Checked PCA for: ", feat_name, True)
  # ====== do sampling ====== #
  np.random.seed(seed)  # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples, replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path,
                               '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistics
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_names in all_stats.items():
    X = {name: ds[name][:] for name in stat_names
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure saved at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
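# ---------------------------------------------------------------------------
# A hypothetical invocation of the validator above; both paths are
# illustrative assumptions:
validate_features('/tmp/mfcc_dataset', path='/tmp/mfcc_report',
                  nb_samples=25, override=True)
# ---------------------------------------------------------------------------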
# ===========================================================================
args = args_parse([
    ('recipe', 'the name of function defined in feature_recipes.py', None),
    ('-feat', "Acoustic feature", ('mspec', 'mfcc'), 'mspec'),
    ('-batch', "batch size", None, 64),
    ('-epoch', "number of epoch", None, 25),
    ('-l', "audio segmenting length in second", None, 3),
    ('--debug', "enable debug mode", None, False),
    ('--train', "force continue training the saved model", None, False),
])
FEAT = args.feat
TRAIN_MODEL = args.train
DEBUG = bool(args.debug)
(EXP_DIR, MODEL_PATH, LOG_PATH,
 TRAIN_PATH, TEST_PATH) = get_model_path('xvec', args)
stdio(LOG_PATH)
# ===========================================================================
# Create data feeder
# ===========================================================================
(train, valid,
 test_ids, test_dat,
 all_speakers) = prepare_dnn_data(recipe=args.recipe, feat=FEAT,
                                  utt_length=args.l)
n_speakers = len(all_speakers) + 1
# ===========================================================================
# Create the network
# ===========================================================================
inputs = [K.placeholder(shape=(None,) + shape[1:],
                        dtype='float32',
                        name='input%d' % i)
          for i, shape in enumerate(as_tuple_of_shape(train.shape))]
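# For intuition, a hypothetical single-input case of the comprehension above
# (the concrete shape is an illustrative assumption):
#   train.shape == (None, 400, 40)
#   => inputs == [K.placeholder(shape=(None, 400, 40),
#                               dtype='float32', name='input0')]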
# (imports of get_dataset, SCVI, NegativeLogLikelihood, and stdio are in the
#  truncated head of this script)
import os
import shutil

os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

epochs = 200
batch_size = 128
max_evals = 80
algorithm = 'bayes'
freq = 1000  # means the callback only runs on_train_end

path = '/tmp/autotune'
if os.path.exists(path):
  shutil.rmtree(path)
os.mkdir(path)
# sc_metrics more robust to NaN values
# TODO: accept a list of loss_name
stdio(os.path.join(path, 'fit_hyper.txt'))
# ===========================================================================
# Cortex
# ===========================================================================
x, y = get_dataset('cortex')
x.filter_cells(min_counts=1).filter_genes(min_counts=1)
gene = x.shape[1]
prot = y.shape[1]
SCVI.fit_hyper(x,
               loss_name='nllk0',
               model_kwargs=dict(units=gene, xdist='zinbd'),
               fit_kwargs=dict(epochs=epochs,
                               batch_size=batch_size,
                               callbacks=[NegativeLogLikelihood(freq=freq)]),
               max_evals=max_evals,
# => Gaussian normalization is better, and float16 is no different from float32
# ===========================================================================
from __future__ import print_function, division, absolute_import

import numpy as np

import os
os.environ['ODIN'] = 'float32,gpu,theano,seed=12,cnmem=0.4'

from odin import backend as K
from odin import nnet as N
from odin import fuel, training
from odin.utils import get_modelpath, ArgController, stdio, get_logpath
from six.moves import cPickle

stdio(get_logpath('tmp.log'))
# ===========================================================================
# Load data
# ===========================================================================
ds = fuel.load_cifar10()
print(ds)
X_train = K.placeholder(shape=(None,) + ds['X_train'].shape[1:],
                        name='X_train')
X_score = K.placeholder(shape=(None,) + ds['X_train'].shape[1:],
                        name='X_score')
y = K.placeholder(shape=(None,), name='y', dtype='int32')
# ===========================================================================
# Build network
# ===========================================================================
ops = N.Sequence([
from odin import preprocessing as pp
from odin import fuel as F, nnet as N, backend as K
from odin.utils import (get_module_from_path, get_script_path, ctext,
                        Progbar, stdio, get_logpath,
                        get_formatted_datetime)
from odin.stats import describe

from helpers import (SCORING_DATASETS, BACKEND_DATASETS,
                     SCORE_SYSTEM_NAME, SCORE_SYSTEM_ID,
                     N_PLDA, N_LDA, PLDA_MAXIMUM_LIKELIHOOD, PLDA_SHOW_LLK,
                     PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE, FEATURE_NAME,
                     get_model_path, NCPU, get_logpath,
                     prepare_dnn_feeder_recipe, sre_file_list, Config,
                     EXP_DIR, VECTORS_DIR, RESULT_DIR,
                     filter_utterances)
# ====== scoring log ====== #
stdio(get_logpath(name='make_score.log', increasing=True,
                  odin_base=False, root=EXP_DIR))
print('=' * 48)
print(get_formatted_datetime(only_number=False))
print("System name    :", SCORE_SYSTEM_NAME)
print("System id      :", SCORE_SYSTEM_ID)
print("Feature recipe :", FEATURE_RECIPE)
print("Feature name   :", FEATURE_NAME)
print("Backend dataset:", ','.join(BACKEND_DATASETS.keys()))
print("Scoring dataset:", ','.join(SCORING_DATASETS.keys()))
print('=' * 48)
# ===========================================================================
# Some helpers
# ===========================================================================
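# A hypothetical example of the kind of utility that might live in this
# section (the original helpers are truncated); it simply mirrors the banner
# printed above:
def print_banner(**fields):
  print('=' * 48)
  for name, value in fields.items():
    print('%-15s: %s' % (name, value))
  print('=' * 48)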
# CONST
# ===========================================================================
# ====== main path ====== #
# inpath = "/mnt/sdb1/TIDIGITS"
inpath = args.path
outpath = '/home/trung/data/TIDIGITS_wav'
compress_path = '/home/trung/data/TIDIGITS.zip'
# ====== others ====== #
wav_path = os.path.join(inpath, "wave")
infopath = os.path.join(inpath, 'data/children/doc/spkrinfo.txt')
logpath = os.path.join(inpath, 'log.txt')
print("Input path:       ", ctext(inpath, 'cyan'))
print("Output path:      ", ctext(outpath, 'cyan'))
print("Convert to WAV at:", ctext(wav_path, 'cyan'))
print("Log path:         ", ctext(logpath, 'cyan'))
stdio(logpath)

exts = get_all_ext(inpath)
audio_files = get_all_files(
    inpath,
    filter_func=lambda f: f[-4:] == '.wav' and
                          f.split('/')[-3] in ('girl', 'boy', 'man', 'woman'))
# ID       Gender  Age  Dialect  Usage
# ID      - Unique 2-character speaker identifier
# Gender  - (M-man, W-woman, B-boy, G-girl)
# Age     - Speaker age at time of recording
# Dialect - Dialect region identifier (see file "dialects.txt" for decode)
# Usage   - (TST-test material, TRN-training material)
info = np.genfromtxt(infopath, dtype=str, skip_header=12)
info = {ID.lower(): (Gender.lower(), Age, Dialect, Usage)
        for ID, Gender, Age, Dialect, Usage in info}
gender_map = {
import os

from sklearn.metrics import accuracy_score, log_loss, f1_score

from odin import fuel as F
from odin import nnet as N, backend as K
from odin import visual as V
from odin.utils import (ctext, mpi, Progbar, catch_warnings_ignore,
                        stdio, get_logpath)

from helpers import (FEATURE_RECIPE, FEATURE_NAME, PATH_ACOUSTIC_FEATURES,
                     MINIMUM_UTT_DURATION, ANALYSIS_DIR, Config,
                     filter_utterances, prepare_dnn_data)
# ====== prepare log ====== #
stdio(get_logpath(name="analyze_data.log", increasing=True,
                  odin_base=False, root=ANALYSIS_DIR))
print(ctext(FEATURE_RECIPE, 'lightyellow'))
print(ctext(FEATURE_NAME, 'lightyellow'))
assert os.path.isdir(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE))
# ====== essential path ====== #
figure_path = os.path.join(ANALYSIS_DIR,
                           '%s_%s.pdf' % (FEATURE_RECIPE.replace('_', ''),
                                          FEATURE_NAME))
print(ctext(figure_path, 'lightyellow'))
# ===========================================================================
# Load the data
# ===========================================================================
ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE),
               read_only=True)
X = ds[FEATURE_NAME]
# remove all noise data
indices = {name: (start, end)
# ====== import ====== #
import os
os.environ['ODIN'] = 'float32,%s,%s' % (args['dev'], args['bk'])

import numpy as np
np.random.seed(1208)

from odin import nnet as N, backend as K, fuel as F, stats
from odin.utils import get_modelpath, stdio, get_logpath, get_datasetpath
from odin.basic import has_roles, BIAS, WEIGHT
from odin import training

# set log path
stdio(path=get_logpath('digit_audio.log', override=True))
# ===========================================================================
# Get wav and process new dataset configuration
# ===========================================================================
# ====== process new features ====== #
if False:
  datapath = F.load_digit_wav()
  output_path = get_datasetpath(name='digit', override=True)
  feat = F.SpeechProcessor(datapath, output_path,
                           audio_ext='wav', sr_new=8000,
                           win=0.025, shift=0.01,
                           nb_melfilters=40,
#!/usr/bin/env python
from __future__ import print_function, division, absolute_import

import numpy as np

from odin.utils import get_modelpath, ArgController, stdio, get_logpath

stdio(get_logpath('tmp.log', override=True))

arg = ArgController(version=0.12
).add('-backend', 'theano or tensorflow', 'tensorflow'
).add('-ds', 'dataset cifar10, or mnist', 'mnist'
).add('-epoch', 'number of epoch', 3
).add('-lr', 'learning rate', 0.01
).parse()

import os
os.environ['ODIN'] = 'float32,gpu,%s,seed=12' % arg['backend']

from odin import backend as K
from odin import nnet as N
from odin import fuel, training
from six.moves import cPickle
# ===========================================================================
# Load data
# ===========================================================================
USE_MNIST_DATA = 'mnist' in arg['ds'].lower()
if USE_MNIST_DATA:
  ds = fuel.load_mnist()
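else:
  # the cifar10 branch is elided in the original snippet; given the '-ds'
  # option above ('dataset cifar10, or mnist'), this completion is an
  # assumption:
  ds = fuel.load_cifar10()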