def _boundary_sub(disc_clsdict, corpus, names, label, verbose, n_jobs,
                  threshold=0.03):
    """Boundary precision and recall over subsampled file sets."""
    eb = eval_from_bounds
    if verbose:
        print ' boundary ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print(' boundary ({0}): prepping boundaries'.format(label),
                    verbose, True, True, True):
        disc_bounds = [Boundaries(disc_clsdict.restrict(ns),
                                  threshold=threshold)
                       for ns in names]
        gold_bounds = [Boundaries(corpus.restrict(ns), threshold=threshold)
                       for ns in names]
    with verb_print(' boundary ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        p, r = izip(*Parallel(n_jobs=n_jobs,
                              verbose=5 if verbose else 0,
                              pre_dispatch='2*n_jobs')
                    (delayed(eb)(disc, gold)
                     for disc, gold in zip(disc_bounds, gold_bounds)))
    p, r = np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)
    p, r = praggregate(p, r)
    return p, r
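# The Parallel/delayed/izip idiom above fans the per-subsample scoring out to
# worker processes and unzips the resulting (precision, recall) pairs. A
# minimal sketch of the same idiom; _toy_score is a made-up stand-in for
# eval_from_bounds, not part of this package:

def _toy_score(xs):
    # pretend (precision, recall) pair for one subsample
    return sum(xs) / float(len(xs)), max(xs) / 10.0

def _parallel_idiom_demo():
    subsamples = [[1, 2, 3], [4, 5], [6]]
    p, r = izip(*Parallel(n_jobs=2)(delayed(_toy_score)(s)
                                    for s in subsamples))
    return np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)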
def _match_sub(disc_clsdict, gold_clsdict, phn_corpus, names, label,
               verbose, n_jobs):
    """Matching precision and recall over subsampled file sets."""
    em = eval_from_psets
    if verbose:
        print ' matching ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print(' matching ({0}): prepping psets'.format(label),
                    verbose, True, True, True):
        pdiscs = [make_pdisc(disc_clsdict.restrict(fs, True), False, False)
                  for fs in names]
        pgolds = [make_pgold(gold_clsdict.restrict(fs, True), False, False)
                  for fs in names]
        psubs = [make_psubs(disc_clsdict.restrict(fs, True), phn_corpus,
                            3, 20, False, False)
                 for fs in names]
    with verb_print(' matching ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        tp, tr = izip(*Parallel(n_jobs=n_jobs,
                                verbose=5 if verbose else 0,
                                pre_dispatch='n_jobs')
                      (delayed(em)(pdisc, pgold, psub)
                       for pdisc, pgold, psub
                       in zip(pdiscs, pgolds, psubs)))
    tp, tr = np.fromiter(tp, dtype=np.double), np.fromiter(tr, dtype=np.double)
    tp, tr = praggregate(tp, tr)
    return tp, tr
def _nlp_sub(disc_clsdict, gold_clsdict, names, label, verbose, n_jobs):
    """NED and coverage over subsampled file sets."""
    ned = NED
    cov = coverage
    if verbose:
        print ' nlp ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print(' nlp ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        ned_score = Parallel(n_jobs=n_jobs,
                             verbose=5 if verbose else 0,
                             pre_dispatch='n_jobs')(
            delayed(ned)(disc_clsdict.restrict(ns, True)) for ns in names)
        cov_score = Parallel(n_jobs=n_jobs,
                             verbose=5 if verbose else 0,
                             pre_dispatch='n_jobs')(
            delayed(cov)(disc_clsdict.restrict(ns, False),
                         gold_clsdict.restrict(ns, False))
            for ns in names)
    # don't replace nan's by 1, but ignore them, unless all values in
    # ned_score are nan
    ned_score, cov_score = np.array(ned_score), np.array(cov_score)
    ned_score, cov_score = aggregate(ned_score, 1), aggregate(cov_score)
    return np.array(ned_score), np.array(cov_score)
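# The comment in _nlp_sub describes the intended NaN policy: ignore NaN
# entries when aggregating, and only fall back to a default when every entry
# is NaN. The real `aggregate` lives elsewhere in the package; this is only a
# sketch of that policy, under the assumption that the second argument is the
# fallback default:

def _aggregate_sketch(scores, default=np.nan):
    # keep the non-NaN entries; if nothing survives, return the default
    scores = np.asarray(scores, dtype=np.double)
    valid = scores[~np.isnan(scores)]
    if valid.size == 0:
        return np.array([default], dtype=np.double)
    return valid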
def make_psubs_nmatch(psubs, verbose, debug):
    with verb_print('making psubs nmatch', verbose, True, True):
        psubs_nmatch = nmatch(psubs)
    if debug:
        print banner('NMATCH(PSUBS)')
        print pformat(psubs_nmatch)
        print
    return psubs_nmatch
def make_typeset(psubs, verbose, debug):
    with verb_print('making typeset', verbose, True, True):
        ts = list(typeset(psubs))
    if debug:
        print banner('TYPES(PSUBS) ({0})'.format(len(ts)))
        print pformat(ts)
        print
    return ts
def make_pdisc(disc_clsdict, verbose, debug):
    with verb_print('constructing pdisc set', verbose, True, True):
        pdisc = list(Pclus(disc_clsdict))
    if debug:
        print banner('PDISC ({0})'.format(len(pdisc)))
        print pretty_pairs(pdisc)
        print
    return pdisc
def make_pgold(gold_clsdict, verbose, debug):
    with verb_print('constructing pgold set', verbose, True, True):
        pgold = list(Pclus(gold_clsdict))
    if debug:
        print banner('PGOLD ({0})'.format(len(pgold)))
        print pretty_pairs(pgold)
        print
    return pgold
def make_pgoldclus_nmatch(pgoldclus, verbose, debug):
    with verb_print('constructing pgoldclus_nmatch', verbose, True, True):
        pgoldclus_nmatch = nmatch(pgoldclus)
    if debug:
        print banner('NMATCH(PGOLDCLUS)')
        print pformat(pgoldclus_nmatch)
        print
    return pgoldclus_nmatch
def make_weights(pclus, verbose, debug):
    with verb_print('constructing weights', verbose, True, True):
        ws = weights(pclus)
    if debug:
        print banner('WEIGHTS')
        print pformat(ws)
        print
    return ws
def make_typeset(pclus, verbose, debug):
    with verb_print('constructing typeset', verbose, True, True):
        ts = list(typeset(pclus))
    if debug:
        print banner('TYPESET ({0})'.format(len(ts)))
        print pformat(ts)
        print
    return ts
def make_weights(psubs, verbose, debug):
    with verb_print('making weights', verbose, True, True):
        ws = weights(psubs)
    if debug:
        print banner('WEIGHTS(PSUBS) ({0})'.format(len(ws)))
        print pformat(ws)
        print
    return ws
def make_psubs_pgold_nmatch(pgold, psubs, verbose, debug):
    with verb_print('making psubs/pgold nmatch', verbose, True, True):
        psubs_pgold_intersect = intersection(pgold, psubs)
        psubs_pgold_nmatch = nmatch(psubs_pgold_intersect)
    if debug:
        print banner('NMATCH(PSUBS/PGOLD)')
        print pformat(psubs_pgold_nmatch)
        print
    return psubs_pgold_nmatch
def make_pgoldclus(disc_clsdict, verbose, debug):
    with verb_print('constructing pgoldclus', verbose, True, True):
        pgoldclus = list(Pgoldclus(disc_clsdict))
    if debug:
        print banner('PGOLDCLUS ({0})'.format(len(pgoldclus)))
        print pretty_pairs(pgoldclus)
        print
    return pgoldclus
def make_pgold_nmatch(pgold, verbose, debug):
    with verb_print('constructing nmatch_gold', verbose, True, True):
        nmatch_gold = nmatch(pgold)
    if debug:
        print banner('nmatch_gold')
        for k, v in nmatch_gold.iteritems():
            print k, v
    return nmatch_gold
def make_pclus_pgoldclus_nmatch(pclus, pgoldclus, verbose, debug):
    with verb_print('making pclus/pgoldclus nmatch', verbose, True, True):
        pclus_pgoldclus_intersect = list(intersection(pclus, pgoldclus))
        pclus_pgoldclus_nmatch = nmatch(pclus_pgoldclus_intersect)
    if debug:
        print banner('NMATCH(PCLUS/PGOLDCLUS)')
        print pformat(pclus_pgoldclus_nmatch)
        print
    return pclus_pgoldclus_nmatch
def make_psubs(disc_clsdict, corpus, minlength, maxlength, verbose, debug):
    with verb_print('constructing psubs set', verbose, True, True):
        psubs = list(Psubs(disc_clsdict, corpus,
                           minlength=minlength, maxlength=maxlength))
    if debug:
        print banner('PSUBS ({0})'.format(len(psubs)))
        print pretty_pairs(psubs)
        print
    return psubs
def make_pclus(disc_clsdict, verbose, debug):
    with verb_print('constructing pclus', verbose, True, True):
        pclus = list(tuple(sorted((f1, f2),
                                  key=lambda f: (f.name, f.interval.start)))
                     for f1, f2 in Pclus_single(disc_clsdict))
    if debug:
        print banner('PCLUS ({0})'.format(len(pclus)))
        print pretty_pairs(pclus)
        print
    return pclus
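# make_pclus canonicalises each fragment pair by sorting on (filename, start
# time), so (a, b) and (b, a) collapse to a single ordering and duplicate
# pairs can be recognised. The same idea on plain (name, start) tuples, which
# are hypothetical stand-ins for fragment objects:

def _canonical_pair_demo():
    f1, f2 = ('wavB', 1.25), ('wavA', 0.50)
    pair = tuple(sorted((f1, f2)))
    return pair  # (('wavA', 0.5), ('wavB', 1.25)): order no longer matters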
def _group_sub(disc_clsdict, names, label, verbose, n_jobs):
    """Grouping precision and recall over subsampled file sets."""
    eg = evaluate_group
    if verbose:
        print ' group ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print(' group ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        p, r = izip(*Parallel(n_jobs=n_jobs,
                              verbose=5 if verbose else 0,
                              pre_dispatch='n_jobs')
                    (delayed(eg)(disc_clsdict.restrict(ns, True))
                     for ns in names))
    p, r = np.fromiter(p, dtype=np.double), np.fromiter(r, dtype=np.double)
    p, r = praggregate(p, r)
    return p, r
def evaluate_token_type(disc_clsdict, wrd_corpus, verbose=False,
                        threshold=0.03, debug=False):
    """Token and type precision/recall of discovered fragments against
    the gold word corpus."""
    n_word_tokens = iterator_length(unique(wrd_corpus.iter_fragments()))
    word_types = set(f.mark for f in wrd_corpus.iter_fragments())
    n_word_types = len(word_types)
    n_disc_fragments = iterator_length(disc_clsdict.iter_fragments())
    with verb_print('querying words', verbose, True, True, True):
        types_hit = set()
        types_seen = set()
        hits = 0
        for disc_fragment in disc_clsdict.iter_fragments():
            disc_start = disc_fragment.interval.start
            disc_end = disc_fragment.interval.end
            wrd_tokens = wrd_corpus.tokens(disc_fragment.name,
                                           disc_fragment.interval)
            types_seen.add(tuple(f.mark for f in wrd_tokens))
            if len(wrd_tokens) != 1:
                continue
            goldtok = wrd_tokens[0]
            if abs(goldtok.interval.start - disc_start) > threshold:
                continue
            if abs(goldtok.interval.end - disc_end) > threshold:
                continue
            types_hit.add(goldtok.mark)
            hits += 1
    # float() guards against integer division under Python 2
    if n_disc_fragments == 0:
        token_prec = np.nan
    else:
        token_prec = hits / float(n_disc_fragments)
    if n_word_tokens == 0:
        token_rec = np.nan
    else:
        token_rec = hits / float(n_word_tokens)
    if len(types_seen) == 0:
        type_prec = np.nan
    else:
        type_prec = len(types_hit) / float(len(types_seen))
    if n_word_types == 0:
        type_rec = np.nan
    else:
        type_rec = len(types_hit) / float(n_word_types)
    return token_prec, token_rec, type_prec, type_rec
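# On a toy input the four scores of evaluate_token_type decompose as below;
# the counts are illustrative only:

def _token_type_demo():
    # 8 matched tokens out of 10 discovered fragments and 40 gold tokens;
    # 5 distinct types hit out of 6 seen and 20 gold types
    hits, n_disc_fragments, n_word_tokens = 8, 10, 40
    n_types_hit, n_types_seen, n_word_types = 5, 6, 20
    token_prec = hits / float(n_disc_fragments)    # 0.8
    token_rec = hits / float(n_word_tokens)        # 0.2
    type_prec = n_types_hit / float(n_types_seen)  # ~0.83
    type_rec = n_types_hit / float(n_word_types)   # 0.25
    return token_prec, token_rec, type_prec, type_rec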
def _token_type_sub(clsdict, wrd_corpus, names, label, verbose, n_jobs):
    et = evaluate_token_type
    if verbose:
        print ' token/type ({2}): subsampled {0} files in {1} sets'\
            .format(sum(map(len, names)), len(names), label)
    with verb_print(' token/type ({0}): calculating scores'.format(label),
                    verbose, False, True, False):
        pto, rto, pty, rty = izip(*(et(clsdict.restrict(ns, False),
                                       wrd_corpus.restrict(ns))
                                    for ns in names))
    pto, rto, pty, rty = np.array(pto), np.array(rto), \
        np.array(pty), np.array(rty)
    pto, rto = praggregate(pto, rto)
    pty, rty = praggregate(pty, rty)
    return pto, rto, pty, rty
def ned_sub(matches, verbose, n_jobs):
    """NED and DTW scores per match, with NaN NED entries dropped."""
    with verb_print(' ned: calculating scores', verbose, False, True, False):
        ned_scores = Parallel(n_jobs=n_jobs,
                              verbose=5 if verbose else 0,
                              pre_dispatch='n_jobs')(
            delayed(NED)(match) for match in matches)
    dtw_scores = [match.dtw for match in matches]
    scores = zip(ned_scores, dtw_scores)
    # NaN != NaN is always True, so test with np.isnan instead of `!=`
    scores = [s for s in scores if not np.isnan(s[0])]
    return zip(*scores)
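# The filter in ned_sub must use np.isnan: NaN compares unequal to
# everything, including itself, so a naive test like `x[0] != np.nan` keeps
# every row. A quick demonstration:

def _nan_filter_demo():
    assert np.nan != np.nan  # NaN is unequal even to itself
    assert np.isnan(np.nan)  # the correct test
    scores = [(0.2, 1.0), (np.nan, 2.0)]
    return [s for s in scores if not np.isnan(s[0])]  # [(0.2, 1.0)]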
def load_disc(fname, corpus, split_file, truncate, verbose):
    """Load discovered classes, reporting (or truncating) invalid
    intervals."""
    with verb_print(' loading discovered classes', verbose, True, True, True):
        split_mapping = load_split(split_file)
        disc, errors = _load_classes(fname, corpus, split_mapping)
        if not truncate:
            errors_found = len(errors) > 0
            if len(errors) > 100:
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                errors = errors[:100]
            for fragment in sorted(errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print ' error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name, fragment.interval.start,
                    fragment.interval.end)
            if errors_found:
                print 'There were errors in {0}. Use option -f to'\
                    ' automatically skip invalid intervals.'.format(fname)
                sys.exit()
    if truncate:
        with verb_print(' checking discovered classes and truncating',
                        verbose, True, True, True):
            disc, filename_errors, interval_errors = \
                truncate_intervals(disc, corpus, split_mapping)
    else:
        with verb_print(' checking discovered classes',
                        verbose, True, True, True):
            filename_errors, interval_errors = \
                check_intervals(disc, split_mapping)
    if not truncate:
        filename_errors = sorted(filename_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_errors = sorted(interval_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_error = len(interval_errors) > 0
        filename_error = len(filename_errors) > 0
        errors_found = filename_error or interval_error
        if interval_error:
            print banner('intervals found in {0} outside of valid'
                         ' splits'.format(fname))
            if len(interval_errors) > 100:
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                interval_errors = interval_errors[:100]
            for fragment in interval_errors:
                print ' error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name, fragment.interval.start,
                    fragment.interval.end)
        if filename_error:
            print banner('unknown filenames found in {0}'.format(fname))
            if len(filename_errors) > 100:
                print 'There were more than 100 filename errors found.'
                print 'Printing only the first 100.'
                print
                filename_errors = filename_errors[:100]
            for fragment in filename_errors:
                print ' error: {0}'.format(fragment.name)
        if errors_found:
            print 'There were errors in {0}. Use option -f to'\
                ' automatically skip invalid intervals.'.format(fname)
            sys.exit()
    return disc
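# The sort-truncate-report block in load_disc repeats three times. A small
# helper capturing the pattern; this is a hypothetical refactoring, not part
# of the package:

def _report_errors(errors, limit=100):
    # print at most `limit` errors, sorted by (filename, start time)
    errors = sorted(errors, key=lambda x: (x.name, x.interval.start))
    if len(errors) > limit:
        print 'There were more than {0} errors found.'.format(limit)
        print 'Printing only the first {0}.'.format(limit)
        print
        errors = errors[:limit]
    for fragment in errors:
        print ' error: {0} [{1:.3f}, {2:.3f}]'.format(
            fragment.name, fragment.interval.start, fragment.interval.end)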
resource_dir = args['trs'][0]
corpus = args['trs'][1]

# if corpus is "other", change resource_dir to get the transcriptions/vad
phn_corpus_file = path.join(resource_dir, '{}.phn'.format(corpus))
wrd_corpus_file = path.join(resource_dir, '{}.wrd'.format(corpus))
vad_file = path.join(resource_dir, '{}.vad'.format(corpus))

if verbose:
    print banner('LOADING FILES')

# load gold phones and gold words
with verb_print(' loading word corpus file', verbose, True, True, True):
    wrd_corpus = load_corpus_txt(wrd_corpus_file)
with verb_print(' loading phone corpus file', verbose, True, True, True):
    phn_corpus = load_corpus_txt(phn_corpus_file)

# load across and within folds; the VAD intervals stand in for fold splits
with verb_print(' loading folds cross', verbose, True, True, True):
    intervals_vad = [load_split(vad_file, multiple=False)]
def load_gold(fname, corpus, verbose):
    with verb_print(' loading gold classes', verbose, True, True, True):
        gold, _ = _load_classes(fname, corpus)
    return gold
def load_fragments_within(fname, verbose):
    with verb_print(' loading folds within', verbose, True, True, True):
        fragments = load_split(fname, multiple=True)
    return fragments
def load_wrd_corpus(wrd_corpus_file, verbose):
    with verb_print(' loading word corpus file', verbose, True, True, True):
        wrd_corpus = _load_corpus(wrd_corpus_file)
    return wrd_corpus
def load_phn_corpus(phn_corpus_file, verbose):
    with verb_print(' loading phone corpus file', verbose, True, True, True):
        phn_corpus = _load_corpus(phn_corpus_file)
    return phn_corpus