Example #1
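# Classification script (fragment): honor the config's CPU/GPU preference, build
# the name of the pickled dataset for this configuration, load the requested
# partition, and restore the trained dnnseg model before classifying utterances.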
    args = argparser.parse_args()

    p = Config(args.config)

    if not p['use_gpu_if_available']:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    t0 = time.time()
    data_path = 'data'
    if p['os']:
        data_path += '_os'
    if p['root']:
        data_path += '_root'
    data_path += '.obj'

    stderr('Loading saved training data...\n')
    sys.stderr.flush()

    data_map = {
        'train': p.train_data_dir,
        'dev': p.dev_data_dir,
        'cv': p.dev_data_dir,
        'test': p.test_data_dir
    }

    # data_path, built above from the config flags, names the pickled dataset file
    with open(data_map[args.partition] + '/' + data_path, 'rb') as f:
        data = pickle.load(f)

    dnnseg_model = load_dnnseg(p.outdir)

    segments, _, summary = dnnseg_model.classify_utterances(
Example #2
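# Training-data construction (fragment): build the training Dataset from the
# semicolon-separated directories in the config, optionally seed random oracle
# segmentations, print a summary, and construct a separate validation Dataset
# when the validation directory differs from the training directory.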
    data_dirs = p.train_data_dir.split(';')
    train_data = Dataset(data_dirs,
                         datatype=p['data_type'].lower(),
                         filter_type=p['filter_type'].lower(),
                         n_coef=p['n_coef'],
                         order=p['order'],
                         force_preprocess=args.preprocess,
                         save_preprocessed_data=True)
    if p['oracle_boundaries']:
        for x in p['oracle_boundaries'].split():
            if x.startswith('rnd'):
                length = float(x[3:])
                train_data.initialize_random_segmentation(length, save=True)

    stderr('=' * 50 + '\n')
    stderr('TRAINING DATA SUMMARY\n\n')
    stderr(train_data.summary(indent=2))
    stderr('=' * 50 + '\n\n')

    if p.train_data_dir != p.val_data_dir:
        data_dirs = p.val_data_dir.split(';')
        val_data = Dataset(data_dirs,
                           datatype=p['data_type'].lower(),
                           filter_type=p['filter_type'].lower(),
                           n_coef=p['n_coef'],
                           order=p['order'],
                           force_preprocess=args.preprocess,
                           save_preprocessed_data=True)
        if p['oracle_boundaries']:
            for x in p['oracle_boundaries'].split():
Example #3
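# Probing classifier: cross-validates a classifier that predicts each available
# target label from the learned embedding dimensions and reports per-label and
# macro-averaged precision/recall/F1/accuracy, optionally against a random
# baseline that samples labels by their empirical frequencies.
# Assumed context not shown in this fragment: numpy as np, pandas as pd, math,
# os, sys, pydot, scikit-learn's RandomForestClassifier, LogisticRegression,
# MLPClassifier, export_graphviz, and precision/recall/f1/accuracy score
# functions, plus codebase helpers (stderr, get_target_cols,
# is_embedding_dimension, get_random_permutation).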
def probe(segment_table,
          class_types,
          lang=None,
          classifier_type='mlp',
          regularization_scale=0.001,
          max_depth=None,
          min_impurity_decrease=0.,
          n_estimators=100,
          n_folds=2,
          units=100,
          compare_to_baseline=False,
          dump_images=False,
          verbose=False,
          name='probe',
          outdir='./probe/'):
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    target_col_names = get_target_cols(class_types, lang=lang)

    X = segment_table
    input_col_names = [c for c in X.columns if is_embedding_dimension.match(c)]
    target_col_names_cur = []
    df_cols = set(X.columns)
    for target_col in target_col_names:
        if target_col in df_cols:
            target_col_names_cur.append(target_col)
        else:
            sys.stderr.write('Ignoring unrecognized target column "%s"...\n' %
                             target_col)
            sys.stderr.flush()

    precision = {}
    recall = {}
    f1 = {}
    accuracy = {}

    precision_baseline = {}
    recall_baseline = {}
    f1_baseline = {}
    accuracy_baseline = {}

    out_dict = {}

    if len(target_col_names_cur):
        for target_col in target_col_names_cur:
            if verbose:
                stderr('  Variable: "%s"\n' % target_col)

            X_cur = X[(~X[target_col].isnull())
                      & (~X[target_col].isin(['SIL', 'SPN']))]
            fold_size = math.ceil(float(len(X_cur)) / n_folds)
            if fold_size:
                perm, perm_inv = get_random_permutation(len(X_cur))
                y = X_cur[target_col]
                if pd.api.types.is_string_dtype(y) or len(y.unique()) > 2:
                    avg_method = 'macro'
                else:
                    avg_method = 'binary'
                    if y.sum() > len(y) / 2:  # Majority class is positive, flip
                        y = 1 - y

                label_set, label_counts = np.unique(y.values,
                                                    return_counts=True)
                label_probs = label_counts / label_counts.sum()

                if verbose:
                    sys.stderr.write('\r    Label proportions:\n')
                    sys.stderr.flush()
                    for level, prob in zip(label_set, label_probs):
                        sys.stderr.write('\r      %s: %s\n' % (level, prob))
                        sys.stderr.flush()

                predictions = []
                gold = []

                for j in range(0, len(X_cur), fold_size):
                    if verbose:
                        sys.stderr.write('\r    Fold %d/%d...' %
                                         (int(j / fold_size) + 1,
                                          math.ceil(len(X_cur) / fold_size)))
                        sys.stderr.flush()
                    if classifier_type.lower() == 'random_forest':
                        classifier = RandomForestClassifier(
                            n_estimators=n_estimators,
                            criterion='entropy',
                            class_weight='balanced',
                            max_depth=max_depth,
                            min_impurity_decrease=min_impurity_decrease)
                    elif classifier_type.lower() in [
                            'mlr', 'logreg', 'logistic_regression'
                    ]:
                        classifier = LogisticRegression(
                            class_weight='balanced',
                            C=regularization_scale,
                            solver='lbfgs',
                            multi_class='auto',
                            max_iter=100)
                    elif classifier_type.lower() in ['mlp', 'neural_network']:
                        if isinstance(units, str):
                            units = [int(x) for x in units.split()]
                        if not (isinstance(units, list)
                                or isinstance(units, tuple)):
                            units = [int(units)]
                        classifier = MLPClassifier(units,
                                                   alpha=regularization_scale)

                    # Fold masks are built over the random permutation, then both
                    # masks are mapped back to original row order before indexing,
                    # so training and held-out rows never overlap.
                    train_select = np.ones(len(X_cur)).astype('bool')
                    train_select[j:j + fold_size] = False
                    cv_select = np.logical_not(train_select)
                    train_select = train_select[perm_inv]
                    cv_select = cv_select[perm_inv]

                    X_train = X_cur[input_col_names][train_select]
                    y_train = y[train_select]
                    if len(y_train.unique()) < 2:
                        break
                    X_cv = X_cur[input_col_names][cv_select]
                    y_cv = y[cv_select]

                    classifier.fit(X_train, y_train)
                    predictions.append(classifier.predict(X_cv))
                    gold.append(y_cv)

                if len(predictions):
                    predictions = np.concatenate(predictions, axis=0)
                    gold = np.concatenate(gold, axis=0)
                    precision[target_col] = precision_score(gold,
                                                            predictions,
                                                            average=avg_method)
                    recall[target_col] = recall_score(gold,
                                                      predictions,
                                                      average=avg_method)
                    f1[target_col] = f1_score(gold,
                                              predictions,
                                              average=avg_method)
                    accuracy[target_col] = accuracy_score(gold, predictions)

                    if verbose:
                        stderr(
                            '\n    Cross-validation F1 for variable "%s": %.4f\n'
                            % (target_col, f1[target_col]))

                    if compare_to_baseline:
                        predictions_baseline = np.random.choice(
                            label_set, size=(len(gold), ), p=label_probs)
                        precision_baseline[target_col] = precision_score(
                            gold, predictions_baseline, average=avg_method)
                        recall_baseline[target_col] = recall_score(
                            gold, predictions_baseline, average=avg_method)
                        f1_baseline[target_col] = f1_score(
                            gold, predictions_baseline, average=avg_method)
                        accuracy_baseline[target_col] = accuracy_score(
                            gold, predictions_baseline)

                        if verbose:
                            stderr(
                                '    Baseline F1 for variable "%s":         %.4f\n'
                                % (target_col, f1_baseline[target_col]))

                    if dump_images and classifier_type.lower() == 'random_forest':
                        tree_ix = np.random.randint(n_estimators)

                        graph = export_graphviz(classifier[tree_ix],
                                                feature_names=input_col_names,
                                                class_names=[
                                                    '-%s' % target_col,
                                                    '+%s' % target_col
                                                ],
                                                rounded=True,
                                                proportion=False,
                                                precision=2,
                                                filled=True)

                        (graph, ) = pydot.graph_from_dot_data(graph)

                        img_str = '/%s_decision_tree_%s.png'

                        outfile = outdir + img_str % (name, target_col)
                        graph.write_png(outfile)

        if len(precision):
            macro_avg = {
                'precision': sum(precision.values()) / len(precision),
                'recall': sum(recall.values()) / len(recall),
                'f1': sum(f1.values()) / len(f1),
                'accuracy': sum(accuracy.values()) / len(accuracy)
            }

            if verbose:
                stderr('    Model macro averages:\n')
                stderr('      P:   %.4f\n' % macro_avg['precision'])
                stderr('      R:   %.4f\n' % macro_avg['recall'])
                stderr('      F1:  %.4f\n' % macro_avg['f1'])
                stderr('      ACC: %.4f\n' % macro_avg['accuracy'])

            if compare_to_baseline:
                macro_avg_baseline = {
                    'precision': sum(precision_baseline.values()) / len(precision_baseline),
                    'recall': sum(recall_baseline.values()) / len(recall_baseline),
                    'f1': sum(f1_baseline.values()) / len(f1_baseline),
                    'accuracy': sum(accuracy_baseline.values()) / len(accuracy_baseline)
                }

                if verbose:
                    stderr('    Baseline macro averages:\n')
                    stderr('      P:   %.4f\n' %
                           macro_avg_baseline['precision'])
                    stderr('      R:   %.4f\n' % macro_avg_baseline['recall'])
                    stderr('      F1:  %.4f\n' % macro_avg_baseline['f1'])
                    stderr('      ACC: %.4f\n' %
                           macro_avg_baseline['accuracy'])

            path_str = '/%s_classifier_scores.txt'
            outfile = outdir + path_str % name
            with open(outfile, 'w') as f:
                f.write('feature precision recall f1 accuracy\n')
                for c in sorted(list(f1.keys())):
                    f.write('%s %s %s %s %s\n' %
                            (c, precision[c], recall[c], f1[c], accuracy[c]))
                f.write('MACRO %s %s %s %s\n' %
                        (macro_avg['precision'], macro_avg['recall'],
                         macro_avg['f1'], macro_avg['accuracy']))

            if compare_to_baseline:
                path_str = '/%s_baseline_scores.txt'
                outfile = outdir + path_str % name
                with open(outfile, 'w') as f:
                    f.write('feature precision recall f1 accuracy\n')
                    for c in sorted(list(f1.keys())):
                        f.write('%s %s %s %s %s\n' %
                                (c, precision_baseline[c], recall_baseline[c],
                                 f1_baseline[c], accuracy_baseline[c]))
                    f.write('MACRO %s %s %s %s\n' %
                            (macro_avg_baseline['precision'],
                             macro_avg_baseline['recall'],
                             macro_avg_baseline['f1'],
                             macro_avg_baseline['accuracy']))

            for c in sorted(list(f1.keys())):
                key_base = '_'.join([name, c])
                out_dict[key_base + '_p'] = precision[c]
                out_dict[key_base + '_r'] = recall[c]
                out_dict[key_base + '_f1'] = f1[c]

                if compare_to_baseline:
                    out_dict[key_base + '_baseline_p'] = precision_baseline[c]
                    out_dict[key_base + '_baseline_r'] = recall_baseline[c]
                    out_dict[key_base + '_baseline_f1'] = f1_baseline[c]

            out_dict['_'.join([name, 'macro_p'])] = macro_avg['precision']
            out_dict['_'.join([name, 'macro_r'])] = macro_avg['recall']
            out_dict['_'.join([name, 'macro_f1'])] = macro_avg['f1']
            out_dict['_'.join([name, 'macro_acc'])] = macro_avg['accuracy']

            if compare_to_baseline:
                out_dict['_'.join([name, 'baseline_macro_p'])] = macro_avg_baseline['precision']
                out_dict['_'.join([name, 'baseline_macro_r'])] = macro_avg_baseline['recall']
                out_dict['_'.join([name, 'baseline_macro_f1'])] = macro_avg_baseline['f1']
                out_dict['_'.join([name, 'baseline_macro_acc'])] = macro_avg_baseline['accuracy']

            if verbose:
                sys.stderr.write('\n')
                sys.stderr.flush()

    return out_dict
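
# Hedged usage sketch (illustrative only; the file name and the class_types /
# lang values below are assumptions, not taken from the original code):
#
#     segment_table = pd.read_csv('embeddings_segs_l1.csv', sep=' ')
#     scores = probe(
#         segment_table,
#         class_types='features',
#         lang='english',
#         classifier_type='logistic_regression',
#         regularization_scale=0.001,
#         n_folds=5,
#         compare_to_baseline=True,
#         verbose=True,
#         name='l1',
#         outdir='./probe/'
#     )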
Example #4
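# Plotting setup (fragment): finish parsing the command-line arguments, resolve
# the class limit, and, for each config file, load the optional label-map and
# feature-map CSVs referenced by the config.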
        '--layers',
        default=None,
        nargs='+',
        help=
        'IDs of layers to plot (0, ..., L). If unspecified, plots all available layers.'
    )
    args = argparser.parse_args()

    if args.class_limit.lower() == 'inf':
        class_limit = np.inf
    else:
        class_limit = int(args.class_limit)

    for config_path in args.config:
        p = Config(config_path)
        stderr('Plotting model defined at path %s...\n' % p['outdir'])

        if p['label_map_file'] is not None and os.path.exists(
                p['label_map_file']):
            label_map = pd.read_csv(p['label_map_file'])
            label_map = dict(zip(label_map.source, label_map.target))
        else:
            label_map = None

        if p['feature_map_file'] is not None and os.path.exists(
                p['feature_map_file']):
            feature_table = pd.read_csv(p['feature_map_file'])
            new_cols = []
            for i in range(len(feature_table.columns)):
                new_cols.append(feature_table.columns[i].replace(' ',
                                                                 '_').replace(
Example #5
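# Validation-data fallback (fragment): if no validation Dataset exists yet,
# build one from the config's validation directories, print a summary, apply a
# random segmentation when segtype is 'rnd', and use it as the data to cache.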
        preprocessed = True

        if val_data is None:
            train_data_dirs = p.train_data_dir.split(';')

            data_dirs = p.val_data_dir.split(';')
            val_data = Dataset(data_dirs,
                               datatype=p['data_type'].lower(),
                               filter_type=p['filter_type'].lower(),
                               n_coef=p['n_coef'],
                               order=p['order'],
                               force_preprocess=args.preprocess,
                               save_preprocessed_data=p.save_preprocessed_data)

            stderr('=' * 50 + '\n')
            stderr('VALIDATION DATA SUMMARY\n\n')
            stderr(val_data.summary(indent=2))
            stderr('=' * 50 + '\n\n')

            t1 = time.time()

            stderr('Data loaded in %ds\n\n' % (t1 - t0))
            sys.stderr.flush()

            if p['segtype'] == 'rnd':
                val_data.initialize_random_segmentation(7.4153)

            data = val_data

            stderr('Caching data...\n')
Example #6
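# Corpus preprocessing (fragment): round the Xitsonga VAD/word/phone interval
# tables to millisecond precision, then copy the sampled Buckeye wav files into
# the output directory's sample/ folder.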
xitsonga_vad.start = xitsonga_vad.start.round(3)
xitsonga_vad.end = xitsonga_vad.end.round(3)
xitsonga_wrd = pd.read_csv(args.tde + 'bin/resources/xitsonga.wrd',
                           sep=' ',
                           header=None,
                           names=['fileID', 'start', 'end', 'label'])
xitsonga_wrd.start = xitsonga_wrd.start.round(3)
xitsonga_wrd.end = xitsonga_wrd.end.round(3)
xitsonga_phn = pd.read_csv(args.tde + 'bin/resources/xitsonga.phn',
                           sep=' ',
                           header=None,
                           names=['fileID', 'start', 'end', 'label'])
xitsonga_phn.start = xitsonga_phn.start.round(3)
xitsonga_phn.end = xitsonga_phn.end.round(3)

stderr('Processing Buckeye Speech Corpus data...\n')
if args.bsc is not None:
    if not os.path.exists(args.outdir + '/sample'):
        os.makedirs(args.outdir + '/sample')

    for fileID in sample_files:
        subject = fileID[:3]
        # The wav file may sit either under <subject>/<fileID>/ or directly
        # under <subject>/; try the nested layout first.
        try:
            in_path = args.bsc + '/' + subject + '/' + fileID + '/' + fileID + '.wav'
            out_path = args.outdir + '/sample/' + fileID + '.wav'
            shutil.copy2(in_path, out_path)
        except OSError:
            in_path = args.bsc + '/' + subject + '/' + fileID + '.wav'
            out_path = args.outdir + '/sample/' + fileID + '.wav'
            shutil.copy2(in_path, out_path)
Example #7
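# Evaluation setup (fragment): inside a deeper conditional not shown here,
# rebuild the training Dataset (optionally with random oracle segmentations),
# then iterate over the datasets given on the command line, assigning each a name.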
                    train_data = Dataset(data_dirs,
                                         datatype=p['data_type'].lower(),
                                         filter_type=p['filter_type'].lower(),
                                         n_coef=p['n_coef'],
                                         order=p['order'],
                                         force_preprocess=False,
                                         save_preprocessed_data=True)
                    if p['oracle_boundaries']:
                        for x in p['oracle_boundaries'].split():
                            if x.startswith('rnd'):
                                length = float(x[3:])
                                train_data.initialize_random_segmentation(
                                    length, save=True)

                    if args.verbose:
                        stderr('=' * 50 + '\n')
                        stderr('TRAINING DATA SUMMARY\n\n')
                        stderr(train_data.summary(indent=2))
                        stderr('=' * 50 + '\n\n')
                else:
                    train_data = None

        for i, dataset in enumerate(args.data):
            info_dict = {}

            if args.names:
                name = args.names[i]
            else:
                name = sn(dataset)

            if args.language is None:
Example #8
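# Embedding projection (fragment): locate the per-layer embedding CSVs in the
# model's output directory, project each requested layer with the method given
# by args.method, and plot the resulting projections.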
    embeddings = [
        x for x in os.listdir(p.outdir)
        if x.startswith('embeddings') and x.endswith('.csv')
    ]

    for e in embeddings:
        segtype, l = emb_file.match(e).groups()
        if args.layers is None or l in args.layers:
            outdir_cur = outdir + '/' + segtype + '/l' + l + '/' + args.method
            if not os.path.exists(outdir_cur):
                os.makedirs(outdir_cur)
            df = pd.read_csv(p.outdir + '/' + e, sep=' ')

            stderr(
                'Projecting using %s. Segtype = %s. Layer = %s. Segments = %d.\n'
                % (args.method.upper(), segtype, l, len(df)))
            sys.stderr.flush()

            df = project_matching_segments(df, method=args.method)

            stderr('Plotting...\n')
            sys.stderr.flush()

            plot_projections(df,
                             label_map=label_map,
                             feature_table=feature_table,
                             feature_names=feature_names,
                             directory=outdir_cur,
                             prefix='l%s_' % l,
                             suffix='.png')