Exemple #1
0
def _boundary_sub(disc_clsdict,
                  corpus,
                  names,
                  label,
                  verbose,
                  n_jobs,
                  threshold=0.03):
    """Boundary precision/recall over subsampled sets of files.

    For each name set in `names`, builds Boundaries objects for the
    discovered classes and for the gold corpus (both restricted to that
    name set, with tolerance `threshold`), scores every (disc, gold) pair
    in parallel with eval_from_bounds, and aggregates the per-set
    precision/recall values with praggregate.

    Returns the aggregated (precision, recall) pair.
    """
    eb = eval_from_bounds
    if verbose:
        print '  boundary ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    # NOTE(review): this block constructs the Boundaries objects, yet its
    # message reads 'calculating scores' like the block below -- looks
    # like a copy-paste; confirm before changing the printed text.
    with verb_print('  boundary ({0}): calculating scores'.format(label),
                    verbose, True, True, True):
        disc_bounds = [
            Boundaries(disc_clsdict.restrict(ns), threshold=threshold)
            for ns in names
        ]
        gold_bounds = [
            Boundaries(corpus.restrict(ns), threshold=threshold)
            for ns in names
        ]
    with verb_print('  boundary ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        # izip(*...) transposes the list of (p, r) tuples into two streams
        p, r = izip(*Parallel(n_jobs=n_jobs, verbose=5 if verbose else 0,
                              pre_dispatch='2*n_jobs') \
                    (delayed(eb)(disc, gold)
                     for disc, gold in zip(disc_bounds, gold_bounds)))
    p, r = np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)
    p, r = praggregate(p, r)
    return p, r
Exemple #2
0
def _match_sub(disc_clsdict, gold_clsdict, phn_corpus, names, label,
               verbose, n_jobs):
    """Matching precision/recall over subsampled sets of files.

    For each name set in `names`, prepares the discovered, gold and
    substring pair sets (pdisc/pgold/psubs), scores each triple in
    parallel with eval_from_psets, and aggregates the per-set
    precision/recall values with praggregate.

    Returns the aggregated (precision, recall) pair.
    """
    em = eval_from_psets
    if verbose:
        print '  matching ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  matching ({0}): prepping psets'.format(label),
                             verbose, True, True, True):
        # the make_* helpers are called with verbose=False, debug=False
        pdiscs = [make_pdisc(disc_clsdict.restrict(fs, True),
                             False, False)
                  for fs in names]
        pgolds = [make_pgold(gold_clsdict.restrict(fs, True),
                             False, False)
                  for fs in names]
        # psubs uses fragment length bounds 3..20 phones
        psubs = [make_psubs(disc_clsdict.restrict(fs, True),
                            phn_corpus, 3, 20, False, False)
                 for fs in names]
    with verb_print('  matching ({0}): calculating scores'
                             .format(label), verbose, False, True, False):
        # izip(*...) transposes the list of (p, r) tuples into two streams
        tp, tr = izip(*Parallel(n_jobs=n_jobs,
                                verbose=5 if verbose else 0,
                                pre_dispatch='n_jobs')
                      (delayed(em)(pdisc, pgold, psub)
                      for pdisc, pgold, psub in zip(pdiscs, pgolds, psubs)))
    tp, tr = np.fromiter(tp, dtype=np.double), np.fromiter(tr, dtype=np.double)
    tp, tr = praggregate(tp, tr)
    return tp, tr
Exemple #3
0
def _match_sub(disc_clsdict, gold_clsdict, phn_corpus, names, label, verbose,
               n_jobs):
    em = eval_from_psets
    if verbose:
        print '  matching ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  matching ({0}): prepping psets'.format(label), verbose,
                    True, True, True):
        pdiscs = [
            make_pdisc(disc_clsdict.restrict(fs, True), False, False)
            for fs in names
        ]
        pgolds = [
            make_pgold(gold_clsdict.restrict(fs, True), False, False)
            for fs in names
        ]
        psubs = [
            make_psubs(disc_clsdict.restrict(fs, True), phn_corpus, 3, 20,
                       False, False) for fs in names
        ]
    with verb_print('  matching ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        tp, tr = izip(*Parallel(
            n_jobs=n_jobs, verbose=5 if verbose else 0, pre_dispatch='n_jobs')(
                delayed(em)(pdisc, pgold, psub)
                for pdisc, pgold, psub in zip(pdiscs, pgolds, psubs)))
    tp, tr = np.fromiter(tp, dtype=np.double), np.fromiter(tr, dtype=np.double)
    tp, tr = praggregate(tp, tr)
    return tp, tr
Exemple #4
0
def _nlp_sub(disc_clsdict, gold_clsdict, names, label, verbose, n_jobs):
    """NED and coverage scores over subsampled sets of files.

    For each name set in `names`, computes NED on the discovered classes
    (restricted with cross-file pairs allowed) and coverage of the
    discovered classes against the gold classes, both in parallel.

    Returns (ned_score, cov_score) as numpy arrays after aggregation.
    """
    # ned
    ned = NED
    cov = coverage
    if verbose:
        print '  nlp ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  nlp ({0}): calculating scores'.format(label), verbose,
                    False, True, False):
        ned_score = Parallel(n_jobs=n_jobs,
                             verbose=5 if verbose else 0,
                             pre_dispatch='n_jobs')(delayed(ned)\
                                                    (disc_clsdict.restrict(ns,
                                                                           True))
                                                    for ns in names)
        cov_score = Parallel(n_jobs=n_jobs,
                             verbose=5 if verbose else 0,
                             pre_dispatch='n_jobs')(delayed(cov)\
                                                    (disc_clsdict.restrict(ns,
                                                                           False),
                                                     gold_clsdict.restrict(ns,
                                                                           False))
                                                    for ns in names)
    # don't replace nan's by 1, but ignore them, unless all values in ned_score
    # are nan
    # NOTE(review): the second argument to aggregate presumably selects that
    # nan-handling behavior for the NED scores -- confirm against aggregate's
    # definition.
    ned_score, cov_score = np.array(ned_score), np.array(cov_score)
    ned_score, cov_score = aggregate(ned_score, 1), aggregate(cov_score)
    return np.array(ned_score), np.array(cov_score)
Exemple #5
0
def _nlp_sub(disc_clsdict, gold_clsdict, names, label, verbose, n_jobs):
    # ned
    ned = NED
    cov = coverage
    if verbose:
        print '  nlp ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  nlp ({0}): calculating scores'
                             .format(label), verbose, False, True, False):
        ned_score = Parallel(n_jobs=n_jobs,
                             verbose=5 if verbose else 0,
                             pre_dispatch='n_jobs')(delayed(ned)\
                                                    (disc_clsdict.restrict(ns,
                                                                           True))
                                                    for ns in names)
        cov_score = Parallel(n_jobs=n_jobs,
                             verbose=5 if verbose else 0,
                             pre_dispatch='n_jobs')(delayed(cov)\
                                                    (disc_clsdict.restrict(ns,
                                                                           False),
                                                     gold_clsdict.restrict(ns,
                                                                           False))
                                                    for ns in names)
    # don't replace nan's by 1, but ignore them, unless all values in ned_score
    # are nan
    ned_score, cov_score = np.array(ned_score), np.array(cov_score)
    ned_score, cov_score = aggregate(ned_score, 1), aggregate(cov_score)
    return np.array(ned_score), np.array(cov_score)
Exemple #6
0
def make_psubs_nmatch(psubs, verbose, debug):
    with verb_print('making psubs nmatch', verbose, True, True):
        psubs_nmatch = nmatch(psubs)
    if debug:
        print banner('NMATCH(PSUBS)')
        print pformat(psubs_nmatch)
        print
    return psubs_nmatch
Exemple #7
0
def make_typeset(psubs, verbose, debug):
    with verb_print('making typeset', verbose, True, True):
        ts = list(typeset(psubs))
    if debug:
        print banner('TYPES(PSUBS) ({0})'.format(len(ts)))
        print pformat(ts)
        print
    return ts
Exemple #8
0
def make_pdisc(disc_clsdict, verbose, debug):
    with verb_print('constructing pdisc set', verbose, True, True):
        pdisc = list(Pclus(disc_clsdict))
    if debug:
        print banner('PDISC ({0})'.format(len(pdisc)))
        print pretty_pairs(pdisc)
        print
    return pdisc
Exemple #9
0
def make_pgold(gold_clsdict, verbose, debug):
    with verb_print('constructing pgold set', verbose, True, True):
        pgold = list(Pclus(gold_clsdict))
    if debug:
        print banner('PGOLD ({0})'.format(len(pgold)))
        print pretty_pairs(pgold)
        print
    return pgold
Exemple #10
0
def make_pgoldclus_nmatch(pgoldclus, verbose, debug):
    with verb_print('constructing pgoldclus_nmatch', verbose, True, True):
        pgoldclus_nmatch = nmatch(pgoldclus)
    if debug:
        print banner('NMATCH(PGOLDCLUS)')
        print pformat(pgoldclus_nmatch)
        print
    return pgoldclus_nmatch
Exemple #11
0
def make_weights(pclus, verbose, debug):
    with verb_print('constructing weights', verbose, True, True):
        ws = weights(pclus)
    if debug:
        print banner('WEIGHTS')
        print pformat(ws)
        print
    return ws
Exemple #12
0
def make_typeset(pclus, verbose, debug):
    with verb_print('constructing typeset', verbose, True, True):
        ts = list(typeset(pclus))
    if debug:
        print banner('TYPESET ({0})'.format(len(ts)))
        print pformat(ts)
        print
    return ts
Exemple #13
0
def make_psubs_nmatch(psubs, verbose, debug):
    with verb_print('making psubs nmatch', verbose, True, True):
        psubs_nmatch = nmatch(psubs)
    if debug:
        print banner('NMATCH(PSUBS)')
        print pformat(psubs_nmatch)
        print
    return psubs_nmatch
Exemple #14
0
def make_weights(psubs, verbose, debug):
    with verb_print('making weights', verbose, True, True):
        ws = weights(psubs)
    if debug:
        print banner('WEIGHTS(PSUBS) ({0})'.format(len(ws)))
        print pformat(ws)
        print
    return ws
Exemple #15
0
def make_weights(psubs, verbose, debug):
    with verb_print('making weights', verbose, True, True):
        ws = weights(psubs)
    if debug:
        print banner('WEIGHTS(PSUBS) ({0})'.format(len(ws)))
        print pformat(ws)
        print
    return ws
Exemple #16
0
def make_typeset(psubs, verbose, debug):
    with verb_print('making typeset', verbose, True, True):
        ts = list(typeset(psubs))
    if debug:
        print banner('TYPES(PSUBS) ({0})'.format(len(ts)))
        print pformat(ts)
        print
    return ts
Exemple #17
0
def make_pdisc(disc_clsdict, verbose, debug):
    with verb_print('constructing pdisc set', verbose, True, True):
        pdisc = list(Pclus(disc_clsdict))
    if debug:
        print banner('PDISC ({0})'.format(len(pdisc)))
        print pretty_pairs(pdisc)
        print
    return pdisc
Exemple #18
0
def make_pgold(gold_clsdict, verbose, debug):
    with verb_print('constructing pgold set', verbose, True, True):
        pgold = list(Pclus(gold_clsdict))
    if debug:
        print banner('PGOLD ({0})'.format(len(pgold)))
        print pretty_pairs(pgold)
        print
    return pgold
Exemple #19
0
def make_psubs_pgold_nmatch(pgold, psubs, verbose, debug):
    with verb_print('making psubs/pgold nmatch', verbose, True, True):
        psubs_pgold_intersect = intersection(pgold, psubs)
        psubs_pgold_nmatch = nmatch(psubs_pgold_intersect)
    if debug:
        print banner('NMATCH(PSUBS/PGOLD)')
        print pformat(psubs_pgold_nmatch)
        print
    return psubs_pgold_nmatch
Exemple #20
0
def make_pgoldclus(disc_clsdict, verbose, debug):
    with verb_print('constructing pgoldclus', verbose, True, True):
        pgoldclus = list(Pgoldclus(disc_clsdict))
    if debug:
        pgoldclus = list(pgoldclus)
        print banner('PGOLDCLUS ({0})'.format(len(pgoldclus)))
        print pretty_pairs(pgoldclus)
        print
    return pgoldclus
Exemple #21
0
def make_pgold_nmatch(pgold, verbose, debug):
    with verb_print('constructing nmatch_gold', verbose, True, True):
        nmatch_gold = nmatch(pgold)

    if debug:
        print banner('nmatch_gold')
        for k, v in nmatch_gold.iteritems():
            print k, v
    return nmatch_gold
Exemple #22
0
def make_pclus_pgoldclus_nmatch(pclus, pgoldclus, verbose, debug):
    with verb_print('making pclus/pgoldclus nmatch', verbose, True, True):
        pclus_pgoldclus_intersect = list(intersection(pclus, pgoldclus))
        pclus_pgoldclus_nmatch = nmatch(pclus_pgoldclus_intersect)
    if debug:
        print banner('NMATCH(PCLUS/PGOLDCLUS)')
        print pformat(pclus_pgoldclus_nmatch)
        print
    return pclus_pgoldclus_nmatch
Exemple #23
0
def make_psubs_pgold_nmatch(pgold, psubs, verbose, debug):
    with verb_print('making psubs/pgold nmatch', verbose, True, True):
        psubs_pgold_intersect = intersection(pgold, psubs)
        psubs_pgold_nmatch = nmatch(psubs_pgold_intersect)
    if debug:
        print banner('NMATCH(PSUBS/PGOLD)')
        print pformat(psubs_pgold_nmatch)
        print
    return psubs_pgold_nmatch
Exemple #24
0
def make_pgold_nmatch(pgold, verbose, debug):
    with verb_print('constructing nmatch_gold', verbose, True, True):
        nmatch_gold = nmatch(pgold)

    if debug:
        print banner('nmatch_gold')
        for k, v in nmatch_gold.iteritems():
            print k, v
    return nmatch_gold
Exemple #25
0
def make_psubs(disc_clsdict, corpus, minlength, maxlength,
               verbose, debug):
    with verb_print('constructing psubs set', verbose, True, True):
        psubs = list(Psubs(disc_clsdict, corpus, minlength=minlength,
                           maxlength=maxlength))
    if debug:
        print banner('PSUBS ({0})'.format(len(psubs)))
        print pretty_pairs(psubs)
        print
    return psubs
Exemple #26
0
def make_pclus(disc_clsdict, verbose, debug):
    with verb_print('constructing pclus', verbose, True, True):
        pclus = list(tuple(sorted((f1, f2),
                             key=lambda f: (f.name, f.interval.start)))
                     for f1, f2 in Pclus_single(disc_clsdict))
    if debug:
        print banner('PCLUS ({0})'.format(len(pclus)))
        print pretty_pairs(pclus)
        print
    return pclus
Exemple #27
0
def _boundary_sub(disc_clsdict, corpus, names, label, verbose, n_jobs):
    """Boundary precision/recall over subsampled sets of files.

    Builds Boundaries objects for the discovered classes and the gold
    corpus restricted to each name set, scores each (disc, gold) pair in
    parallel with eval_from_bounds, and aggregates with praggregate.

    Returns the aggregated (precision, recall) pair.
    """
    eb = eval_from_bounds
    if verbose:
        print '  boundary ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    # NOTE(review): this block constructs the Boundaries objects, yet its
    # message reads 'calculating scores' like the block below -- looks
    # like a copy-paste; confirm before changing the printed text.
    with verb_print('  boundary ({0}): calculating scores'
                             .format(label), verbose, True, True, True):
        disc_bounds = [Boundaries(disc_clsdict.restrict(ns))
                       for ns in names]
        gold_bounds = [Boundaries(corpus.restrict(ns))
                       for ns in names]
    with verb_print('  boundary ({0}): calculating scores'
                             .format(label), verbose, False, True, False):
        # izip(*...) transposes the list of (p, r) tuples into two streams
        p, r = izip(*Parallel(n_jobs=n_jobs, verbose=5 if verbose else 0,
                              pre_dispatch='2*n_jobs') \
                    (delayed(eb)(disc, gold)
                     for disc, gold in zip(disc_bounds, gold_bounds)))
    p, r = np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)
    p, r = praggregate(p, r)
    return p, r
Exemple #28
0
def make_psubs(disc_clsdict, corpus, minlength, maxlength, verbose, debug):
    with verb_print('constructing psubs set', verbose, True, True):
        psubs = list(
            Psubs(disc_clsdict,
                  corpus,
                  minlength=minlength,
                  maxlength=maxlength))
    if debug:
        print banner('PSUBS ({0})'.format(len(psubs)))
        print pretty_pairs(psubs)
        print
    return psubs
Exemple #29
0
def _group_sub(disc_clsdict, names, label, verbose, n_jobs):
    """Grouping precision/recall over subsampled sets of files.

    Scores evaluate_group on the discovered classes restricted to each
    name set (in parallel), then aggregates the per-set precision/recall
    values with praggregate.

    Returns the aggregated (precision, recall) pair.
    """
    eg = evaluate_group
    if verbose:
        print '  group ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  group ({0}): calculating scores'.format(label), verbose,
                    False, True, False):
        # izip(*...) transposes the list of (p, r) tuples into two streams
        p, r = izip(*(Parallel(
            n_jobs=n_jobs, verbose=5 if verbose else 0, pre_dispatch='n_jobs')(
                delayed(eg)(disc_clsdict.restrict(ns, True)) for ns in names)))
    p, r = np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)
    p, r = praggregate(p, r)
    return p, r
Exemple #30
0
def evaluate_token_type(disc_clsdict,
                        wrd_corpus,
                        verbose=False,
                        threshold=0.03,
                        debug=False):
    """Token and type precision/recall of discovered fragments vs gold words.

    A discovered fragment is a hit when it overlaps exactly one gold word
    token and both of its boundaries lie within `threshold` seconds of
    that token's boundaries.

    Returns (token_prec, token_rec, type_prec, type_rec); each score is
    np.nan when its denominator is zero.
    """
    n_word_tokens = iterator_length(unique(wrd_corpus.iter_fragments()))
    word_types = set(f.mark for f in wrd_corpus.iter_fragments())
    n_word_types = len(word_types)
    n_disc_fragments = iterator_length(disc_clsdict.iter_fragments())

    with verb_print('querying words', verbose, True, True, True):
        types_hit = set()
        types_seen = set()
        hits = 0
        for disc_fragment in disc_clsdict.iter_fragments():
            disc_start = disc_fragment.interval.start
            disc_end = disc_fragment.interval.end
            wrd_tokens = wrd_corpus.tokens(disc_fragment.name,
                                           disc_fragment.interval)
            # every overlapped word sequence counts as a "seen" type,
            # even when the fragment spans several words
            types_seen.add(tuple(f.mark for f in wrd_tokens))
            if len(wrd_tokens) != 1:
                continue
            goldtok = wrd_tokens[0]
            if abs(goldtok.interval.start - disc_start) > threshold:
                continue
            if abs(goldtok.interval.end - disc_end) > threshold:
                continue
            types_hit.add(goldtok.mark)
            hits += 1

    # BUG FIX: these four ratios divide two ints; under Python 2 (this
    # file uses print statements) `/` is integer division, truncating
    # every score to 0 or 1.  float() forces true division and is a no-op
    # if `from __future__ import division` is in effect at file top.
    if n_disc_fragments == 0:
        token_prec = np.nan
    else:
        token_prec = hits / float(n_disc_fragments)

    if n_word_tokens == 0:
        token_rec = np.nan
    else:
        token_rec = hits / float(n_word_tokens)

    if len(types_seen) == 0:
        type_prec = np.nan
    else:
        type_prec = len(types_hit) / float(len(types_seen))

    if n_word_types == 0:
        type_rec = np.nan
    else:
        type_rec = len(types_hit) / float(n_word_types)

    return token_prec, token_rec, type_prec, type_rec
Exemple #31
0
def _boundary_sub(disc_clsdict, corpus, names, label, verbose, n_jobs):
    eb = eval_from_bounds
    if verbose:
        print '  boundary ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  boundary ({0}): calculating scores'.format(label),
                    verbose, True, True, True):
        disc_bounds = [Boundaries(disc_clsdict.restrict(ns)) for ns in names]
        gold_bounds = [Boundaries(corpus.restrict(ns)) for ns in names]
    #print(len(disc_bounds[0].bounds))
    acc = 0
    '''for element in disc_bounds:
        for key in element.bounds.keys():
            acc += len(element.bounds[key])
        print(acc, len(element.bounds))'''

    acc = 0
    '''for element in gold_bounds:
        for key in element.bounds.keys():
            acc += len(element.bounds[key])
        print(acc, len(element.bounds))
        acc = 0'''

    #print(element.bounds.keys()[:10], len(element.bounds))
    #print(element.bounds[element.bounds.keys()[0]])

    with verb_print('  boundary ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        p, r = izip(*Parallel(n_jobs=n_jobs, verbose=5 if verbose else 0,
                              pre_dispatch='2*n_jobs') \
                    (delayed(eb)(disc, gold)
                     for disc, gold in zip(disc_bounds, gold_bounds)))

    p, r = np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)
    p, r = praggregate(p, r)
    return p, r
Exemple #32
0
def _token_type_sub(clsdict, wrd_corpus, names, label, verbose, n_jobs):
    """Token/type precision and recall over subsampled sets of files.

    Runs evaluate_token_type on the classes and word corpus restricted to
    each name set (sequentially, despite the n_jobs parameter), then
    aggregates token scores and type scores separately with praggregate.

    Returns (token_prec, token_rec, type_prec, type_rec).
    """
    et = evaluate_token_type
    if verbose:
        print '  token/type ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  token/type ({0}): calculating scores'
                             .format(label), verbose, False, True, False):
        # NOTE(review): n_jobs is accepted but unused here -- the scores
        # are computed in a plain generator, not via Parallel.
        pto, rto, pty, rty = izip(*(et(clsdict.restrict(ns, False),
                                       wrd_corpus.restrict(ns))
                                    for ns in names))
    pto, rto, pty, rty = np.array(pto), np.array(rto), np.array(pty), np.array(rty)
    pto, rto = praggregate(pto, rto)
    pty, rty = praggregate(pty, rty)

    return pto, rto, pty, rty
Exemple #33
0
def _group_sub(disc_clsdict, names, label, verbose, n_jobs):
    eg = evaluate_group
    if verbose:
        print '  group ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  group ({0}): calculating scores'.format(label),
                             verbose, False, True, False):
        p, r = izip(*(Parallel(n_jobs=n_jobs,
                              verbose=5 if verbose else 0,
                              pre_dispatch='n_jobs')
                     (delayed(eg)(disc_clsdict.restrict(ns, True))
                      for ns in names)))
    p, r = np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)
    p, r = praggregate(p, r)
    return p, r
Exemple #34
0
def ned_sub(matches, verbose, n_jobs):
    """NED and DTW scores for `matches`, with NaN NED entries filtered out.

    Computes NED for each match in parallel, pairs each NED score with
    the match's dtw attribute, drops pairs whose NED is NaN, and returns
    the surviving scores unzipped as (ned_scores, dtw_scores).
    """
    # ned
    with verb_print('  ned: calculating scores', verbose, False, True, False):
        ned_scores = Parallel(n_jobs=n_jobs,
                              verbose=5 if verbose else 0,
                              pre_dispatch='n_jobs')(delayed(NED)
                                                     (match)
                                                     for match in matches)
        dtw_scores = [match.dtw for match in matches]

    scores = zip(ned_scores, dtw_scores)
    print(len(scores))
    # BUG FIX: the original filter used `x[0] != np.nan`, which is always
    # True (NaN never compares equal to itself), so no NaN entry was ever
    # removed.  np.isnan performs the intended test.
    scores = filter(lambda x: not np.isnan(x[0]), scores)
    print(len(scores))
    return zip(*scores)
Exemple #35
0
def _token_type_sub(clsdict, wrd_corpus, names, label, verbose, n_jobs):
    et = evaluate_token_type
    if verbose:
        print '  token/type ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print('  token/type ({0}): calculating scores'
                             .format(label), verbose, False, True, False):
        pto, rto, pty, rty = izip(*(et(clsdict.restrict(ns, False),
                                       wrd_corpus.restrict(ns))
                                    for ns in names))
    pto, rto, pty, rty = np.array(pto), np.array(rto), np.array(pty), np.array(rty)
    pto, rto = praggregate(pto, rto)
    pty, rty = praggregate(pty, rty)

    return pto, rto, pty, rty
Exemple #36
0
def evaluate_token_type(disc_clsdict, wrd_corpus,
                        verbose=False, debug=False, threshold=0.03):
    """Token and type precision/recall of discovered fragments vs gold words.

    A discovered fragment is a hit when it overlaps exactly one gold word
    token and both of its boundaries lie within `threshold` seconds of
    that token's boundaries.  The tolerance was previously hard-coded to
    0.03; it is now a keyword parameter (default unchanged) for
    consistency with the variant of this function that already takes it.

    Returns (token_prec, token_rec, type_prec, type_rec); each score is
    np.nan when its denominator is zero.
    """
    n_word_tokens = iterator_length(unique(wrd_corpus.iter_fragments()))
    word_types = set(f.mark for f in wrd_corpus.iter_fragments())
    n_word_types = len(word_types)
    n_disc_fragments = iterator_length(disc_clsdict.iter_fragments())

    with verb_print('querying words', verbose, True, True, True):
        types_hit = set()
        types_seen = set()
        hits = 0
        for disc_fragment in disc_clsdict.iter_fragments():
            disc_start = disc_fragment.interval.start
            disc_end = disc_fragment.interval.end
            wrd_tokens = wrd_corpus.tokens(disc_fragment.name,
                                           disc_fragment.interval)
            # every overlapped word sequence counts as a "seen" type,
            # even when the fragment spans several words
            types_seen.add(tuple(f.mark for f in wrd_tokens))
            if len(wrd_tokens) != 1:
                continue
            goldtok = wrd_tokens[0]
            if abs(goldtok.interval.start - disc_start) > threshold:
                continue
            if abs(goldtok.interval.end - disc_end) > threshold:
                continue
            types_hit.add(goldtok.mark)
            hits += 1

    # BUG FIX: these four ratios divide two ints; under Python 2 (this
    # file uses print statements) `/` is integer division, truncating
    # every score to 0 or 1.  float() forces true division and is a no-op
    # if `from __future__ import division` is in effect at file top.
    if n_disc_fragments == 0:
        token_prec = np.nan
    else:
        token_prec = hits / float(n_disc_fragments)

    if n_word_tokens == 0:
        token_rec = np.nan
    else:
        token_rec = hits / float(n_word_tokens)

    if len(types_seen) == 0:
        type_prec = np.nan
    else:
        type_prec = len(types_hit) / float(len(types_seen))

    if n_word_types == 0:
        type_rec = np.nan
    else:
        type_rec = len(types_hit) / float(n_word_types)

    return token_prec, token_rec, type_prec, type_rec
Exemple #37
0
def load_disc(fname, corpus, split_file, truncate, verbose):
    """Load discovered classes from `fname` and validate their intervals.

    Loads the split mapping from `split_file`, reads the classes against
    `corpus`, then either truncates invalid intervals (truncate=True) or
    reports filename/interval errors and exits the process via sys.exit().

    Returns the loaded (and possibly truncated) class dictionary.
    """
    with verb_print('  loading discovered classes',
                             verbose, True, True, True):
        split_mapping = load_split(split_file)
        disc, errors = _load_classes(fname, corpus, split_mapping)
        if not truncate:
            errors_found = len(errors) > 0
            if len(errors) > 100:
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                errors = errors[:100]
            for fragment in sorted(errors, key=lambda x: (x.name, x.interval.start)):
                print '  error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name, fragment.interval.start, fragment.interval.end)
            # NOTE(review): `not truncate` is always True inside this
            # branch -- the inner check is redundant.
            if not truncate and errors_found:
                print 'There were errors in {0}. Use option -f to'\
                    ' automatically skip invalid intervals.'.format(fname)
                sys.exit()

    if truncate:
        # NOTE(review): unlike every other call site, this verb_print is
        # given only the message and no verbose/flag arguments -- confirm
        # verb_print has defaults for them.
        with verb_print('  checking discovered classes and truncating'):
            disc, filename_errors, interval_errors = \
                truncate_intervals(disc, corpus,
                                   split_mapping)
    else:
        with verb_print('  checking discovered classes', verbose, True,
                                 True, True):
            filename_errors, interval_errors = \
                check_intervals(disc, split_mapping)
    if not truncate:
        filename_errors = sorted(filename_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_errors = sorted(interval_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_error = len(interval_errors) > 0
        filename_error = len(filename_errors) > 0
        errors_found = filename_error or interval_error
        if interval_error:
            print banner('intervals found in {0} outside of valid'
                                      ' splits'.format(fname))
            if len(interval_errors) > 100:
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                interval_errors = interval_errors[:100]
            for fragment in sorted(interval_errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print '  error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name,
                    fragment.interval.start, fragment.interval.end)
        if filename_error:
            print banner('unknown filenames found in {0}'
                                      .format(fname))
            if len(filename_errors) > 100:
                print 'There were more than 100 filename errors found.'
                print 'Printing only the first 100.'
                print
                filename_errors = filename_errors[:100]
            for fragment in sorted(filename_errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print '  error: {0}'.format(fragment.name)
        if not truncate and errors_found:
            print 'There were errors in {0}. Use option -f to automatically skip invalid intervals.'.format(fname)
            sys.exit()
    return disc
Exemple #38
0
def load_disc(fname, corpus, split_file, truncate, verbose):
    """Load discovered classes from `fname` and validate their intervals.

    Loads the split mapping from `split_file`, reads the classes against
    `corpus`, then either truncates invalid intervals (truncate=True) or
    reports filename/interval errors and exits the process via sys.exit().

    Returns the loaded (and possibly truncated) class dictionary.
    """
    with verb_print('  loading discovered classes', verbose, True, True, True):
        split_mapping = load_split(split_file)
        disc, errors = _load_classes(fname, corpus, split_mapping)
        if not truncate:
            errors_found = len(errors) > 0
            if len(errors) > 100:
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                errors = errors[:100]
            for fragment in sorted(errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print '  error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name, fragment.interval.start,
                    fragment.interval.end)
            # NOTE(review): `not truncate` is always True inside this
            # branch -- the inner check is redundant.
            if not truncate and errors_found:
                print 'There were errors in {0}. Use option -f to'\
                    ' automatically skip invalid intervals.'.format(fname)
                sys.exit()

    if truncate:
        # NOTE(review): unlike every other call site, this verb_print is
        # given only the message and no verbose/flag arguments -- confirm
        # verb_print has defaults for them.
        with verb_print('  checking discovered classes and truncating'):
            disc, filename_errors, interval_errors = \
                truncate_intervals(disc, corpus,
                                   split_mapping)
    else:
        with verb_print('  checking discovered classes', verbose, True, True,
                        True):
            filename_errors, interval_errors = \
                check_intervals(disc, split_mapping)
    if not truncate:
        filename_errors = sorted(filename_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_errors = sorted(interval_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_error = len(interval_errors) > 0
        filename_error = len(filename_errors) > 0
        errors_found = filename_error or interval_error
        if interval_error:
            print banner('intervals found in {0} outside of valid'
                         ' splits'.format(fname))
            if len(interval_errors) > 100:
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                interval_errors = interval_errors[:100]
            for fragment in sorted(interval_errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print '  error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name, fragment.interval.start,
                    fragment.interval.end)
        if filename_error:
            print banner('unknown filenames found in {0}'.format(fname))
            if len(filename_errors) > 100:
                print 'There were more than 100 filename errors found.'
                print 'Printing only the first 100.'
                print
                filename_errors = filename_errors[:100]
            for fragment in sorted(filename_errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print '  error: {0}'.format(fragment.name)
        if not truncate and errors_found:
            print 'There were errors in {0}. Use option -f to automatically skip invalid intervals.'.format(
                fname)
            sys.exit()
    return disc
Exemple #39
0
    # Fragment of a larger function (its def is outside this view):
    # resolves transcription/vad resource paths and loads the corpora.
    resource_dir = args['trs'][0]

    corpus = args['trs'][1]

    # if corpus is "other", change resource_dir to get the transcriptions/vad

    # derived resource paths: phone transcription, word transcription, vad
    phn_corpus_file   = path.join(resource_dir, '{}.phn'.format(corpus))
    wrd_corpus_file   = path.join(resource_dir, '{}.wrd'.format(corpus))
    vad_file          = path.join(resource_dir, '{}.vad'.format(corpus))
    print vad_file

    if verbose:
        print banner('LOADING FILES')

    # load gold phones and gold words
    with verb_print('  loading word corpus file',
                             verbose, True, True, True):
        wrd_corpus = load_corpus_txt(wrd_corpus_file)

    with verb_print('  loading phone corpus file',
                             verbose, True, True, True):
        phn_corpus = load_corpus_txt(phn_corpus_file)

    # load across and withing folds
    with verb_print('  loading folds cross',
                             verbose, True, True, True):
        #fragments_cross = load_split(folds_cross_file,
        #                             multiple=False)
        intervals_vad = [load_split(vad_file,
                                     multiple=False)]
    # get list of file names from vad: 
    #    names = load_names(vad_file)
Exemple #40
0
def load_gold(fname, corpus, verbose):
    """Load the gold classes from `fname` against `corpus`."""
    with verb_print('  loading gold classes',
                             verbose, True, True, True):
        gold, _errors = _load_classes(fname, corpus)
    return gold
Exemple #41
0
def load_fragments_within(fname, verbose):
    """Load the within-speaker fold split (multiple splits) from `fname`."""
    with verb_print('  loading folds within',
                             verbose, True, True, True):
        folds = load_split(fname, multiple=True)
    return folds
Exemple #42
0
def load_fragments_within(fname, verbose):
    """Read the within-speaker folds from `fname`."""
    message = '  loading folds within'
    with verb_print(message, verbose, True, True, True):
        result = load_split(fname, multiple=True)
    return result
Exemple #43
0
def load_wrd_corpus(wrd_corpus_file, verbose):
    """Load the word-level corpus from `wrd_corpus_file`."""
    with verb_print('  loading word corpus file', verbose, True, True, True):
        corpus = _load_corpus(wrd_corpus_file)
    return corpus
Exemple #44
0
def load_phn_corpus(phn_corpus_file, verbose):
    """Load the phone-level corpus from `phn_corpus_file`."""
    with verb_print('  loading phone corpus file', verbose, True, True, True):
        corpus = _load_corpus(phn_corpus_file)
    return corpus
Exemple #45
0
def load_wrd_corpus(wrd_corpus_file, verbose):
    """Read the word corpus file and return the parsed corpus."""
    message = '  loading word corpus file'
    with verb_print(message,
                             verbose, True, True, True):
        result = _load_corpus(wrd_corpus_file)
    return result
Exemple #46
0
def load_gold(fname, corpus, verbose):
    """Read the gold classes from `fname`, discarding the error list."""
    with verb_print('  loading gold classes', verbose, True, True, True):
        gold = _load_classes(fname, corpus)[0]
    return gold
Exemple #47
0
def load_phn_corpus(phn_corpus_file, verbose):
    """Read the phone corpus file and return the parsed corpus."""
    message = '  loading phone corpus file'
    with verb_print(message,
                             verbose, True, True, True):
        result = _load_corpus(phn_corpus_file)
    return result