args = argparser.parse_args()

p = Config(args.config)

if not p['use_gpu_if_available']:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

t0 = time.time()

# Name of the cached data object, keyed by preprocessing options
data_name = 'data'
if p['os']:
    data_name += '_os'
if p['root']:
    data_name += '_root'
data_name += '.obj'

stderr('Loading saved training data...\n')
sys.stderr.flush()

data_map = {
    'train': p.train_data_dir,
    'dev': p.dev_data_dir,
    'cv': p.dev_data_dir,
    'test': p.test_data_dir
}

with open(data_map[args.partition] + '/' + data_name, 'rb') as f:
    data = pickle.load(f)

dnnseg_model = load_dnnseg(p.outdir)

segments, _, summary = dnnseg_model.classify_utterances(
data_dirs = p.train_data_dir.split(';')
train_data = Dataset(
    data_dirs,
    datatype=p['data_type'].lower(),
    filter_type=p['filter_type'].lower(),
    n_coef=p['n_coef'],
    order=p['order'],
    force_preprocess=args.preprocess,
    save_preprocessed_data=True
)
if p['oracle_boundaries']:
    for x in p['oracle_boundaries'].split():
        if x.startswith('rnd'):
            length = float(x[3:])
            train_data.initialize_random_segmentation(length, save=True)

stderr('=' * 50 + '\n')
stderr('TRAINING DATA SUMMARY\n\n')
stderr(train_data.summary(indent=2))
stderr('=' * 50 + '\n\n')

if p.train_data_dir != p.val_data_dir:
    data_dirs = p.val_data_dir.split(';')
    val_data = Dataset(
        data_dirs,
        datatype=p['data_type'].lower(),
        filter_type=p['filter_type'].lower(),
        n_coef=p['n_coef'],
        order=p['order'],
        force_preprocess=args.preprocess,
        save_preprocessed_data=True
    )
    if p['oracle_boundaries']:
        for x in p['oracle_boundaries'].split():
def probe(
        segment_table,
        class_types,
        lang=None,
        classifier_type='mlp',
        regularization_scale=0.001,
        max_depth=None,
        min_impurity_decrease=0.,
        n_estimators=100,
        n_folds=2,
        units=100,
        compare_to_baseline=False,
        dump_images=False,
        verbose=False,
        name='probe',
        outdir='./probe/'
):
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    target_col_names = get_target_cols(class_types, lang=lang)

    X = segment_table
    input_col_names = [c for c in X.columns if is_embedding_dimension.match(c)]

    # Keep only target columns that actually occur in the segment table
    target_col_names_cur = []
    df_cols = set(X.columns)
    for target_col in target_col_names:
        if target_col in df_cols:
            target_col_names_cur.append(target_col)
        else:
            sys.stderr.write('Ignoring unrecognized target column "%s"...\n' % target_col)
            sys.stderr.flush()

    precision = {}
    recall = {}
    f1 = {}
    accuracy = {}
    precision_baseline = {}
    recall_baseline = {}
    f1_baseline = {}
    accuracy_baseline = {}
    out_dict = {}

    if len(target_col_names_cur):
        for target_col in target_col_names_cur:
            if verbose:
                stderr('  Variable: "%s"\n' % target_col)

            X_cur = X[(~X[target_col].isnull()) & (~X[target_col].isin(['SIL', 'SPN']))]
            fold_size = math.ceil(float(len(X_cur)) / n_folds)

            if fold_size:
                perm, perm_inv = get_random_permutation(len(X_cur))

                y = X_cur[target_col]
                if pd.api.types.is_string_dtype(y) or len(y.unique()) > 2:
                    avg_method = 'macro'
                else:
                    avg_method = 'binary'
                    if y.sum() > len(y) / 2:  # Majority class is positive, flip
                        y = 1 - y

                label_set, label_counts = np.unique(y.values, return_counts=True)
                label_probs = label_counts / label_counts.sum()

                if verbose:
                    sys.stderr.write('\r    Label proportions:\n')
                    sys.stderr.flush()
                    for level, prob in zip(label_set, label_probs):
                        sys.stderr.write('\r      %s: %s\n' % (level, prob))
                        sys.stderr.flush()

                predictions = []
                gold = []

                for j in range(0, len(X_cur), fold_size):
                    if verbose:
                        sys.stderr.write('\r    Fold %d/%d...' % (
                            int(j / fold_size) + 1, math.ceil(len(X_cur) / fold_size)))
                        sys.stderr.flush()

                    if classifier_type.lower() == 'random_forest':
                        classifier = RandomForestClassifier(
                            n_estimators=n_estimators,
                            criterion='entropy',
                            class_weight='balanced',
                            max_depth=max_depth,
                            min_impurity_decrease=min_impurity_decrease
                        )
                    elif classifier_type.lower() in ['mlr', 'logreg', 'logistic_regression']:
                        classifier = LogisticRegression(
                            class_weight='balanced',
                            C=regularization_scale,
                            solver='lbfgs',
                            multi_class='auto',
                            max_iter=100
                        )
                    elif classifier_type.lower() in ['mlp', 'neural_network']:
                        if isinstance(units, str):
                            units = [int(x) for x in units.split()]
                        if not (isinstance(units, list) or isinstance(units, tuple)):
                            units = [int(units)]
                        classifier = MLPClassifier(units, alpha=regularization_scale)

                    train_select = np.ones(len(X_cur)).astype('bool')
                    train_select[j:j + fold_size] = False
                    cv_select = np.logical_not(train_select)
                    # Apply the same permutation to both masks so train and CV folds stay disjoint
                    train_select = train_select[perm_inv]
                    cv_select = cv_select[perm_inv]

                    X_train = X_cur[input_col_names][train_select]
                    y_train = y[train_select]
                    if len(y_train.unique()) < 2:
                        break
                    X_cv = X_cur[input_col_names][cv_select]
                    y_cv = y[cv_select]

                    classifier.fit(X_train, y_train)
                    predictions.append(classifier.predict(X_cv))
                    gold.append(y_cv)

                if len(predictions):
                    predictions = np.concatenate(predictions, axis=0)
                    gold = np.concatenate(gold, axis=0)

                    precision[target_col] = precision_score(gold, predictions, average=avg_method)
                    recall[target_col] = recall_score(gold, predictions, average=avg_method)
                    f1[target_col] = f1_score(gold, predictions, average=avg_method)
                    accuracy[target_col] = accuracy_score(gold, predictions)

                    if verbose:
                        stderr('\n    Cross-validation F1 for variable "%s": %.4f\n' % (
                            target_col, f1[target_col]))

                    if compare_to_baseline:
                        # Baseline: sample labels at random from the empirical label distribution
                        predictions_baseline = np.random.choice(label_set, size=(len(gold),), p=label_probs)
                        precision_baseline[target_col] = precision_score(gold, predictions_baseline, average=avg_method)
                        recall_baseline[target_col] = recall_score(gold, predictions_baseline, average=avg_method)
                        f1_baseline[target_col] = f1_score(gold, predictions_baseline, average=avg_method)
                        accuracy_baseline[target_col] = accuracy_score(gold, predictions_baseline)

                        if verbose:
                            stderr('    Baseline F1 for variable "%s": %.4f\n' % (
                                target_col, f1_baseline[target_col]))

                    if dump_images and classifier_type.lower() == 'random_forest':
                        tree_ix = np.random.randint(n_estimators)
                        graph = export_graphviz(
                            classifier[tree_ix],
                            feature_names=input_col_names,
                            class_names=['-%s' % target_col, '+%s' % target_col],
                            rounded=True,
                            proportion=False,
                            precision=2,
                            filled=True
                        )
                        (graph,) = pydot.graph_from_dot_data(graph)
                        outfile = outdir + '/%s_decision_tree_%s.png' % (name, target_col)
                        graph.write_png(outfile)

        if len(precision):
            macro_avg = {
                'precision': sum(precision.values()) / len(precision),
                'recall': sum(recall.values()) / len(recall),
                'f1': sum(f1.values()) / len(f1),
                'accuracy': sum(accuracy.values()) / len(accuracy)
            }

            if verbose:
                stderr('  Model macro averages:\n')
                stderr('    P:   %.4f\n' % macro_avg['precision'])
                stderr('    R:   %.4f\n' % macro_avg['recall'])
                stderr('    F1:  %.4f\n' % macro_avg['f1'])
                stderr('    ACC: %.4f\n' % macro_avg['accuracy'])

            if compare_to_baseline:
                macro_avg_baseline = {
                    'precision': sum(precision_baseline.values()) / len(precision_baseline),
                    'recall': sum(recall_baseline.values()) / len(recall_baseline),
                    'f1': sum(f1_baseline.values()) / len(f1_baseline),
                    'accuracy': sum(accuracy_baseline.values()) / len(accuracy_baseline)
                }

                if verbose:
                    stderr('  Baseline macro averages:\n')
                    stderr('    P:   %.4f\n' % macro_avg_baseline['precision'])
                    stderr('    R:   %.4f\n' % macro_avg_baseline['recall'])
                    stderr('    F1:  %.4f\n' % macro_avg_baseline['f1'])
                    stderr('    ACC: %.4f\n' % macro_avg_baseline['accuracy'])

            outfile = outdir + '/%s_classifier_scores.txt' % name
            with open(outfile, 'w') as f:
                f.write('feature precision recall f1 accuracy\n')
                for c in sorted(list(f1.keys())):
                    f.write('%s %s %s %s %s\n' % (c, precision[c], recall[c], f1[c], accuracy[c]))
                f.write('MACRO %s %s %s %s\n' % (
                    macro_avg['precision'], macro_avg['recall'], macro_avg['f1'], macro_avg['accuracy']))

            if compare_to_baseline:
                outfile = outdir + '/%s_baseline_scores.txt' % name
                with open(outfile, 'w') as f:
                    f.write('feature precision recall f1 accuracy\n')
                    for c in sorted(list(f1.keys())):
                        f.write('%s %s %s %s %s\n' % (
                            c, precision_baseline[c], recall_baseline[c], f1_baseline[c], accuracy_baseline[c]))
                    f.write('MACRO %s %s %s %s\n' % (
                        macro_avg_baseline['precision'], macro_avg_baseline['recall'],
                        macro_avg_baseline['f1'], macro_avg_baseline['accuracy']))

            for c in sorted(list(f1.keys())):
                key_base = '_'.join([name, c])
                out_dict[key_base + '_p'] = precision[c]
                out_dict[key_base + '_r'] = recall[c]
                out_dict[key_base + '_f1'] = f1[c]
                if compare_to_baseline:
                    out_dict[key_base + '_baseline_p'] = precision_baseline[c]
                    out_dict[key_base + '_baseline_r'] = recall_baseline[c]
                    out_dict[key_base + '_baseline_f1'] = f1_baseline[c]

            out_dict['_'.join([name, 'macro_p'])] = macro_avg['precision']
            out_dict['_'.join([name, 'macro_r'])] = macro_avg['recall']
            out_dict['_'.join([name, 'macro_f1'])] = macro_avg['f1']
            out_dict['_'.join([name, 'macro_acc'])] = macro_avg['accuracy']
            if compare_to_baseline:
                out_dict['_'.join([name, 'baseline_macro_p'])] = macro_avg_baseline['precision']
                out_dict['_'.join([name, 'baseline_macro_r'])] = macro_avg_baseline['recall']
                out_dict['_'.join([name, 'baseline_macro_f1'])] = macro_avg_baseline['f1']
                out_dict['_'.join([name, 'baseline_macro_acc'])] = macro_avg_baseline['accuracy']

    if verbose:
        sys.stderr.write('\n')
        sys.stderr.flush()

    return out_dict
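# Hypothetical usage sketch (not part of the original source): shows how probe()
# might be called on a segment table exported by the model. The CSV path, the
# class_types value, and the name/outdir arguments below are illustrative
# assumptions, not values taken from this codebase.
#
#     segment_table = pd.read_csv('outdir/embeddings_segs_l1.csv', sep=' ')
#     scores = probe(
#         segment_table,
#         class_types='features',              # assumed value; valid options depend on get_target_cols()
#         lang='english',                      # assumed language key
#         classifier_type='logistic_regression',
#         regularization_scale=0.001,
#         n_folds=2,
#         compare_to_baseline=True,
#         verbose=True,
#         name='l1_probe',
#         outdir='./probe/'
#     )
#     # scores maps keys like '<name>_<feature>_{p,r,f1}' and '<name>_macro_*' to floats.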
    '--layers',
    default=None,
    nargs='+',
    help='IDs of layers to plot (0, ..., L). If unspecified, plots all available layers.'
)
args = argparser.parse_args()

if args.class_limit.lower() == 'inf':
    class_limit = np.inf
else:
    class_limit = int(args.class_limit)

for config_path in args.config:
    p = Config(config_path)

    stderr('Plotting model defined at path %s...\n' % p['outdir'])

    if p['label_map_file'] is not None and os.path.exists(p['label_map_file']):
        label_map = pd.read_csv(p['label_map_file'])
        label_map = dict(zip(label_map.source, label_map.target))
    else:
        label_map = None

    if p['feature_map_file'] is not None and os.path.exists(p['feature_map_file']):
        feature_table = pd.read_csv(p['feature_map_file'])
        new_cols = []
        for i in range(len(feature_table.columns)):
            new_cols.append(feature_table.columns[i].replace(' ', '_').replace(
preprocessed = True

if val_data is None:
    train_data_dirs = p.train_data_dir.split(';')
    data_dirs = p.val_data_dir.split(';')
    val_data = Dataset(
        data_dirs,
        datatype=p['data_type'].lower(),
        filter_type=p['filter_type'].lower(),
        n_coef=p['n_coef'],
        order=p['order'],
        force_preprocess=args.preprocess,
        save_preprocessed_data=p.save_preprocessed_data
    )

    stderr('=' * 50 + '\n')
    stderr('VALIDATION DATA SUMMARY\n\n')
    stderr(val_data.summary(indent=2))
    stderr('=' * 50 + '\n\n')

t1 = time.time()
stderr('Data loaded in %ds\n\n' % (t1 - t0))
sys.stderr.flush()

if p['segtype'] == 'rnd':
    val_data.initialize_random_segmentation(7.4153)

data = val_data

stderr('Caching data...\n')
xitsonga_vad.start = xitsonga_vad.start.round(3)
xitsonga_vad.end = xitsonga_vad.end.round(3)

xitsonga_wrd = pd.read_csv(
    args.tde + 'bin/resources/xitsonga.wrd',
    sep=' ',
    header=None,
    names=['fileID', 'start', 'end', 'label']
)
xitsonga_wrd.start = xitsonga_wrd.start.round(3)
xitsonga_wrd.end = xitsonga_wrd.end.round(3)

xitsonga_phn = pd.read_csv(
    args.tde + 'bin/resources/xitsonga.phn',
    sep=' ',
    header=None,
    names=['fileID', 'start', 'end', 'label']
)
xitsonga_phn.start = xitsonga_phn.start.round(3)
xitsonga_phn.end = xitsonga_phn.end.round(3)

stderr('Processing Buckeye Speech Corpus data...\n')
if args.bsc is not None:
    if not os.path.exists(args.outdir + '/sample'):
        os.makedirs(args.outdir + '/sample')
    for fileID in sample_files:
        subject = fileID[:3]
        # The wav file may sit in a per-file subdirectory or directly under the
        # speaker directory; try the nested layout first and fall back on failure.
        try:
            in_path = args.bsc + '/' + subject + '/' + fileID + '/' + fileID + '.wav'
            out_path = args.outdir + '/sample/' + fileID + '.wav'
            shutil.copy2(in_path, out_path)
        except OSError:
            in_path = args.bsc + '/' + subject + '/' + fileID + '.wav'
            out_path = args.outdir + '/sample/' + fileID + '.wav'
            shutil.copy2(in_path, out_path)
    train_data = Dataset(
        data_dirs,
        datatype=p['data_type'].lower(),
        filter_type=p['filter_type'].lower(),
        n_coef=p['n_coef'],
        order=p['order'],
        force_preprocess=False,
        save_preprocessed_data=True
    )
    if p['oracle_boundaries']:
        for x in p['oracle_boundaries'].split():
            if x.startswith('rnd'):
                length = float(x[3:])
                train_data.initialize_random_segmentation(length, save=True)

    if args.verbose:
        stderr('=' * 50 + '\n')
        stderr('TRAINING DATA SUMMARY\n\n')
        stderr(train_data.summary(indent=2))
        stderr('=' * 50 + '\n\n')
else:
    train_data = None

for i, dataset in enumerate(args.data):
    info_dict = {}

    if args.names:
        name = args.names[i]
    else:
        name = sn(dataset)

    if args.language is None:
embeddings = [
    x for x in os.listdir(p.outdir)
    if x.startswith('embeddings') and x.endswith('.csv')
]

for e in embeddings:
    segtype, l = emb_file.match(e).groups()
    if args.layers is None or l in args.layers:
        outdir_cur = outdir + '/' + segtype + '/l' + l + '/' + args.method
        if not os.path.exists(outdir_cur):
            os.makedirs(outdir_cur)

        df = pd.read_csv(p.outdir + '/' + e, sep=' ')
        stderr('Projecting using %s. Segtype = %s. Layer = %s. Segments = %d.\n' % (
            args.method.upper(), segtype, l, len(df)))
        sys.stderr.flush()

        df = project_matching_segments(df, method=args.method)

        stderr('Plotting...\n')
        sys.stderr.flush()

        plot_projections(
            df,
            label_map=label_map,
            feature_table=feature_table,
            feature_names=feature_names,
            directory=outdir_cur,
            prefix='l%s_' % l,
            suffix='.png'
        )
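# Resulting output layout, derived from the path construction above (exact
# filenames depend on plot_projections(), so the trailing part is illustrative):
#
#     <outdir>/<segtype>/l<layer>/<method>/l<layer>_<plot>.png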