Example #1
def get_embs_labels(k, allembs, allcnts, reverse_dict, plot_only):
	embs_list = []
	labels_list = []
	norm_cnts, mask = filter_words(allcnts, 0.25)
	is_untrained_embs = np.count_nonzero(norm_cnts) == 0

	for i in range(k):
		embs = []
		labels = []

		for j in plot_only:
			if(mask[j][i] or is_untrained_embs):
				embs.append(allembs[i][j, :])

		embs_list.append(embs)

		if k == 1:
			for j in plot_only:
				if(mask[j][i] or is_untrained_embs):
					labels.append(
						reverse_dict[j]+'_{}'.format(int(allcnts[j][i])))
		else:
			for j in plot_only:
				if(mask[j][i] or is_untrained_embs):
					labels.append(reverse_dict[j]+'_'+str(i) \
						+ '_{}'.format(int(allcnts[j][i])))

		labels_list += labels

	return embs_list, labels_list
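The filter_words helper called here is not shown on this page. A minimal sketch consistent with the call site, assuming it normalizes the per-model counts in allcnts and returns them together with a boolean keep-mask indexed as mask[word][model], might look like the following (the names and threshold semantics are assumptions, not the original implementation):

import numpy as np

def filter_words(allcnts, threshold):
    """Hypothetical sketch: normalize counts per model and mask rare words.

    allcnts is assumed to have shape (n_words, k). Returns (norm_cnts, mask)
    where mask[j][i] is True when word j's normalized count for model i is at
    least `threshold`.
    """
    allcnts = np.asarray(allcnts, dtype=float)
    peaks = allcnts.max(axis=0, keepdims=True)        # per-model maximum count
    norm_cnts = np.divide(allcnts, peaks,
                          out=np.zeros_like(allcnts), where=peaks != 0)
    mask = norm_cnts >= threshold                     # keep sufficiently frequent words
    return norm_cnts, mask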
Example #2
def main():
    parser = get_parser()
    args = parser.parse_args()

    if args.doc:
        print __doc__
        sys.exit()

    g = geosearchclass.GeoSearchClass()

    if args.params:
        print 'Using parameters from ' + str(args.params)
        # turn parameter file into dictionary
        g.set_params_from_file(args.params)
        
    if args.address:
        print "Finding geocoordates for address:\n{}".format(args.address)
        coords = geo_converter.get_geocoords_from_address(args.address)
        if coords:
            g.latitude = coords[0]
            print "Found this latitude:"
            print g.latitude
            g.longitude = coords[1]
            print "Found this longitude:"
            print g.longitude
        else:
            print "Failed to find coordinates. Exiting."
            sys.exit()

    if args.input:
        text = utils.load_file(args.input)
        tokens = utils.tokenize_normal_words(text)
        for_poem = utils.filter_words(tokens)
    else:
        for_poem = get_default_words()

    if args.markov:
        if args.input:
            raise StandardError("Can only input a single text file. \
use --markov <your_text_file.txt>")
        else:
            text = utils.load_file(args.markov)
            # ngram = ngrams.make_ngram(text, 2)
            ngram = ngrams.make_bigram_trigram_dictionary(text)
            formatted_poem = create_poem(g, for_poem, ngram)
    else:
        formatted_poem = create_poem(g, for_poem)

    if args.output:
        print '\nwriting formatted poem to ' + str(args.output)
        output_file = args.output
    else:
        print "\nwriting formatted poem to poem.txt"
        output_file = "poem.txt"

    utils.save_file(output_file, formatted_poem)
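utils.filter_words is not shown in this example either. A plausible stand-in, assuming its job is to keep deduplicated alphabetic tokens of moderate length for the poem, could be (purely an illustration, not the project's actual helper):

def filter_words(tokens):
    """Hypothetical sketch: keep deduplicated, alphabetic, moderately short words."""
    seen = set()
    kept = []
    for token in tokens:
        word = token.lower()
        # drop punctuation, numbers, very long tokens and duplicates
        if word.isalpha() and len(word) <= 12 and word not in seen:
            seen.add(word)
            kept.append(word)
    return kept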
Example #3
async def inline_handler(inline_query: types.InlineQuery):
    text = inline_query.query.lower()
    if not text or inline_query.from_user.id not in VIP and (await amt_donated(inline_query.from_user.id)) < 10:
        await inline_query.answer(
            [
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a classic game",
                    description="/startclassic@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startclassic@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a hard mode game",
                    description="/starthard@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/starthard@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a chaos game",
                    description="/startchaos@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startchaos@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a chosen first letter game",
                    description="/startcfl@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startcfl@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a banned letters game",
                    description="/startbl@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startbl@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start a required letter game",
                    description="/startrl@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startrl@on9wordchainbot"),
                ),
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="Start an elimination game",
                    description="/startelim@on9wordchainbot",
                    input_message_content=types.InputTextMessageContent("/startelim@on9wordchainbot"),
                ),
            ],
            is_personal=not text,
        )
        return

    if any(c not in ascii_lowercase for c in text):
        await inline_query.answer(
            [
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title="A query can only consist of alphabets",
                    description="Try a different query",
                    input_message_content=types.InputTextMessageContent(r"¯\\_(ツ)\_/¯"),
                )
            ],
            is_personal=True,
        )
        return

    res = []
    for i in filter_words(starting_letter=text[0]):
        if i.startswith(text):
            i = i.capitalize()
            res.append(
                types.InlineQueryResultArticle(
                    id=str(uuid4()),
                    title=i,
                    input_message_content=types.InputTextMessageContent(i),
                )
            )
            if len(res) == 50:  # Max 50 results
                break
    if not res:  # No results
        res.append(
            types.InlineQueryResultArticle(
                id=str(uuid4()),
                title="No results found",
                description="Try a different query",
                input_message_content=types.InputTextMessageContent(r"¯\\_(ツ)\_/¯"),
            )
        )
    await inline_query.answer(res, is_personal=True)
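The filter_words call in this handler takes only a starting_letter keyword and is iterated lazily, so it presumably yields dictionary words beginning with that letter. A hedged sketch under that assumption (WORDS below is a placeholder vocabulary, not the bot's real word list):

from string import ascii_lowercase

WORDS = {"apple", "angle", "axiom", "banana"}  # placeholder vocabulary

def filter_words(starting_letter):
    # Yield known words that begin with the requested lowercase letter.
    assert starting_letter in ascii_lowercase
    return sorted(word for word in WORDS if word.startswith(starting_letter))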
Example #4
def train(config, sample_validation_batches):
    source_language = config.get('src_language')
    target_language = config.get('trg_language')
    EOS_token = config.get('EOS_token')
    PAD_token = config.get('PAD_token')
    SOS_token = config.get('SOS_token')
    train_iter = config.get('train_iter')
    val_iter = config.get('val_iter')
    writer_path = config.get('writer_path')
    writer_train_path = get_or_create_dir(writer_path, 'train')
    writer_val_path = get_or_create_dir(writer_path, 'val')
    writer_train = SummaryWriter(log_dir=writer_train_path)
    writer_val = SummaryWriter(log_dir=writer_val_path)
    epochs = config.get('epochs')
    training = config.get('training')
    eval_every = training.get('eval_every')
    sample_every = training.get('sample_every')
    use_attention = config.get('use_attention')
    step = 1
    for epoch in range(epochs):
        print(f'Epoch: {epoch+1}/{epochs}')
        save_weights(config)
        for i, training_batch in enumerate(train_iter):
            loss = train_batch(config, training_batch)
            writer_train.add_scalar('loss', loss, step)

            if step == 1 or step % eval_every == 0:
                val_lengths = 0
                val_losses = 0
                reference_corpus = []
                translation_corpus = []
                for val_batch in val_iter:
                    val_loss, translations = evaluate_batch(config, val_batch)
                    val_lengths += 1
                    val_losses += val_loss
                    val_batch_trg, _ = val_batch.trg
                    _, batch_size = val_batch_trg.shape
                    references = [
                        [list(filter_words(
                            torch2words(target_language, val_batch_trg[:, i]),
                            SOS_token, EOS_token, PAD_token))]
                        for i in range(batch_size)
                    ]
                    reference_corpus.extend(references)
                    translations = [
                        list(filter_words(
                            list2words(target_language, translation),
                            SOS_token, EOS_token, PAD_token))
                        for translation in translations
                    ]
                    translation_corpus.extend(translations)
                bleu = compute_bleu(reference_corpus, translation_corpus)
                val_loss = val_losses / val_lengths
                writer_val.add_scalar('bleu', bleu, step)
                writer_val.add_scalar('loss', val_loss, step)

            if step % sample_every == 0:
                val_batch = sample_validation_batches(1)
                val_batch_src, val_lengths_src = val_batch.src
                val_batch_trg, _ = val_batch.trg
                s0 = val_lengths_src[0].item()
                _, translations, attention_weights = evaluate_batch(
                    config, val_batch, True)
                source_words = torch2words(source_language, val_batch_src[:, 0])
                target_words = torch2words(target_language, val_batch_trg[:, 0])
                translation_words = list(
                    filter(lambda word: word != PAD_token,
                           list2words(target_language, translations[0])))
                if use_attention and sum(attention_weights.shape) != 0:
                    attention_figure = visualize_attention(
                        source_words[:s0], translation_words,
                        with_cpu(attention_weights))
                    writer_val.add_figure('attention', attention_figure, step)
                text = get_text(source_words, target_words, translation_words,
                                SOS_token, EOS_token, PAD_token)
                writer_val.add_text('translation', text, step)

            step += 1

    save_weights(config)
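In this training loop, filter_words(words, SOS_token, EOS_token, PAD_token) is always wrapped in list(), so it plausibly yields the sequence with the special tokens stripped before BLEU is computed. A minimal sketch under that assumption, not the original implementation:

def filter_words(words, SOS_token, EOS_token, PAD_token):
    # Hypothetical sketch: drop special tokens before scoring translations.
    specials = {SOS_token, EOS_token, PAD_token}
    return (word for word in words if word not in specials)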
Example #5
def main():
    args = check_argv()
    feat_type = "mfcc"

    list_dir = path.join("lists", args.language)
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    feat_dir = path.join(feat_type, args.language)
    if not path.isdir(feat_dir):
        os.makedirs(feat_dir)

    # All ground truth word segments with pronunciations
    for subset in ["dev"]:  #, "eval", "train"]:

        list_fn = path.join(list_dir, subset + ".all_gt_words.list")
        pronunciations_fn = path.join(list_dir, subset + ".prons")

        # Read forced alignments and obtain pronunciations
        word_fa_fn = path.join(gp_alignments_dir, args.language,
                               subset + ".ctm")
        phone_fa_fn = path.join(
            # gp_alignments_dir, args.language, subset + ".phone.ctm"
            gp_alignments_dir,
            args.language,
            subset + ".phone.ipa.ctm")
        if not path.isfile(phone_fa_fn):
            print("Warning: IPA pronunciations not found")
            phone_fa_fn = path.join(gp_alignments_dir, args.language,
                                    subset + ".phone.ctm")
        pronunciations_dict = pronunciations_from_fa(word_fa_fn, phone_fa_fn)

        # Write pronunciation list
        if not path.isfile(pronunciations_fn):
            print("Writing:", pronunciations_fn)
            with codecs.open(pronunciations_fn, "w", "utf-8") as f:
                for segment_key in sorted(pronunciations_dict):
                    f.write(segment_key + " " +
                            ",".join(pronunciations_dict[segment_key]) + "\n")
        else:
            print("Using existing file:", pronunciations_fn)

        # Write word list
        if not path.isfile(list_fn):
            print("Writing:", list_fn)
            with codecs.open(list_fn, "w", "utf-8") as f:
                for segment_key in sorted(pronunciations_dict):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", list_fn)

        # Write individual phone list
        phone_list_fn = path.join(list_dir, subset + ".phone.list")
        if not path.isfile(phone_list_fn):
            utils.filter_words(phone_fa_fn,
                               phone_list_fn,
                               min_frames=5,
                               min_chars=0)
        else:
            print("Using existing file:", phone_list_fn)

        # Filter phones
        print("Reading:", phone_list_fn)
        phone_segment_keys = []
        with codecs.open(phone_list_fn, "r", "utf-8") as f:
            for line in f:
                phone_segment_keys.append(line.strip())
        phone_filtered_keys = filter_segment_keys(phone_segment_keys,
                                                  n_max_tokens=5000)
        phone_filtered_list_fn = path.join(list_dir,
                                           subset + ".filter1_phone.list")
        print("Writing:", phone_filtered_list_fn)
        if not path.isfile(phone_filtered_list_fn):
            with codecs.open(phone_filtered_list_fn, "w", "utf-8") as f:
                for segment_key in sorted(phone_filtered_keys):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", phone_filtered_list_fn)

        # Extract phone segments from the MFCC NumPy archives
        input_npz_fn = path.join("..", "features", feat_type, args.language,
                                 args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir,
            args.language.lower() + "." + subset + ".filter1_phone.npz")
        if not path.isfile(output_npz_fn):
            utils.segments_from_npz(input_npz_fn, phone_filtered_list_fn,
                                    output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

        if args.analyse:
            import matplotlib.pyplot as plt
            import numpy as np

            # Most common words
            labels = [i.split("_")[0] for i in pronunciations_dict]
            counter = Counter(labels)
            print("No. word types:", len(counter))
            print("No. word tokens:", len(labels))
            print("Most common words:", counter.most_common(10))

            # Histogram of word count
            counts = counter.values()
            plt.figure()
            plt.hist(counts, 50)
            plt.yscale("log")
            plt.ylabel("No. of types with this many tokens")
            plt.xlabel("No. of tokens")

            # # Temp
            # # Most common words
            # labels = [i.split("_")[0] for i in filtered_keys]
            # counter = Counter(labels)
            # print("No. word types:", len(counter))
            # print("No. word tokens:", len(labels))
            # print("Most common words:", counter.most_common(10))

            # # Histogram of word count
            # counts = counter.values()
            # plt.figure()
            # plt.hist(counts, 50)
            # plt.yscale("log")
            # plt.ylabel("No. of types with this many tokens")
            # plt.xlabel("No. of tokens")

            plt.show()

        # Filter 1
        print("Applying filter 1")
        n_min_tokens_per_type = 10
        n_max_tokens_per_type = 25
        filtered_keys = filter_segment_keys(list(pronunciations_dict),
                                            n_min_tokens_per_type,
                                            n_max_tokens_per_type)
        print("No. tokens:", len(filtered_keys))
        print("No. types:", len(set([i.split("_")[0] for i in filtered_keys])))
        filtered_list_fn = path.join(list_dir, subset + ".filter1_gt.list")
        print("Writing:", filtered_list_fn)
        if not path.isfile(filtered_list_fn):
            with codecs.open(filtered_list_fn, "w", "utf-8") as f:
                for segment_key in sorted(filtered_keys):
                    f.write(segment_key + "\n")
        else:
            print("Using existing file:", filtered_list_fn)

        # Extract word segments from the MFCC NumPy archives
        input_npz_fn = path.join("..", "features", feat_type, args.language,
                                 args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir,
            args.language.lower() + "." + subset + ".filter1_gt.npz")
        if not path.isfile(output_npz_fn):
            utils.segments_from_npz(input_npz_fn, filtered_list_fn,
                                    output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)
Example #6
import utils
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import warnings

warnings.filterwarnings("ignore")

# Load the Word2Vec model
model = Word2Vec.load(model_file)

# Read the data
df_all = pd.read_csv(processed_data_file, encoding='utf-8', nrows=train_num)

contents = list(df_all['Query_List'].values)
words = [[word for word in content.split(' ')] for content in contents]
words_list = utils.filter_words(words)

print('Building word2vec features...')
w2v_feat = np.zeros((len(words_list), w2v_dim))  # w2v_dim = 300, the word-vector dimensionality
w2v_feat_avg = np.zeros((len(words_list), w2v_dim))

i = 0
for words in words_list:
    num = 0
    for word in words:
        vec = model[word]
        w2v_feat[i, :] += vec  # sum the word vectors belonging to the same document
        num += 1

    w2v_feat_avg[i, :] = w2v_feat[i, :] / num  # average of the word vectors of the same document
    i += 1
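Here utils.filter_words(words) receives a list of token lists, one per document, before the Word2Vec lookups. Its filtering criteria are not shown; a hypothetical version that merely drops empty and single-character tokens would be:

def filter_words(words):
    # Hypothetical sketch: words is a list of token lists, one per document.
    # The real utils.filter_words may instead remove stopwords or tokens
    # missing from the Word2Vec vocabulary.
    return [[w for w in doc if len(w.strip()) > 1] for doc in words]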
Example #7
def main():
    args = check_argv()
    feat_type = "mfcc"

    # RAW FEATURES

    # Extract MFCCs for the different sets
    feat_dir = path.join(feat_type, args.language)
    if not path.isdir(feat_dir):
        os.makedirs(feat_dir)
    for subset in ["dev", "eval", "train"]:
        raw_feat_fn = path.join(feat_dir,
                                args.language.lower() + "." + subset + ".npz")
        if not path.isfile(raw_feat_fn):
            print("Extracting MFCCs:", subset)
            extract_features_for_subset(args.language, subset, feat_type,
                                        raw_feat_fn)
        else:
            print("Using existing file:", raw_feat_fn)

    # assert False

    # GROUND TRUTH WORD SEGMENTS

    list_dir = path.join("lists", args.language)
    if not path.isdir(list_dir):
        os.makedirs(list_dir)
    for subset in ["dev", "eval", "train"]:

        # Create a ground truth word list (at least 50 frames and 5 characters)
        fa_fn = path.join(gp_alignments_dir, args.language, subset + ".ctm")
        list_fn = path.join(list_dir, subset + ".gt_words.list")
        if not path.isfile(list_fn):
            if args.language == "KO":
                min_frames = 26
                min_chars = 3
            elif args.language == "TH":
                min_frames = 38
                min_chars = 2
            elif args.language == "VN":
                min_frames = 30
                min_chars = 4
            else:
                min_frames = 50
                min_chars = 5
            utils.filter_words(fa_fn,
                               list_fn,
                               min_frames=min_frames,
                               min_chars=min_chars)
        else:
            print("Using existing file:", list_fn)

        # Extract word segments from the MFCC NumPy archives
        input_npz_fn = path.join(feat_dir,
                                 args.language.lower() + "." + subset + ".npz")
        output_npz_fn = path.join(
            feat_dir,
            args.language.lower() + "." + subset + ".gt_words.npz")
        if not path.isfile(output_npz_fn):
            print("Extracting MFCCs for ground truth word tokens:", subset)
            utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
        else:
            print("Using existing file:", output_npz_fn)

    # UTD-DISCOVERED WORD SEGMENTS

    # Change Enno Hermann's pair file to the appropriate format
    enno_pairs_fn = path.join(
        "..",
        "data",
        args.language,  # "pairs_sw_utd.train"
        "pairs_sw_utd_plp_vtln.train")
    if not path.isfile(enno_pairs_fn):
        # This might not be an evaluation language
        return
    pairs_fn = path.join("lists", args.language, "train.utd_pairs.list")
    if not path.isfile(pairs_fn):
        utils.format_enno_pairs(enno_pairs_fn, pairs_fn)
    else:
        print("Using existing file:", pairs_fn)
    list_fn = path.join("lists", args.language, "train.utd_terms.list")
    if not path.isfile(list_fn):
        print("Reading:", pairs_fn)
        terms = set()
        with codecs.open(pairs_fn, "r", "utf-8") as pairs_f:
            for line in pairs_f:
                term1, term2 = line.strip().split(" ")
                terms.add(term1)
                terms.add(term2)
        print("Writing:", list_fn)
        with codecs.open(list_fn, "w", "utf-8") as list_f:
            for term in sorted(terms):
                list_f.write(term + "\n")
    else:
        print("Using existing file:", list_fn)

    # Extract UTD segments
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(feat_dir,
                              args.language.lower() + ".train.utd_terms.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD word tokens")
        utils.segments_from_npz(input_npz_fn, list_fn, output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # UTD SEGMENTS THAT HAVE BEEN PARTIALLY FIXED

    # Write list with fixed labels and segments
    fixed_labels_list_fn = path.join("lists", args.language,
                                     "train.utd_terms.fixed_labels.list")
    fixed_segs_list_fn = path.join("lists", args.language,
                                   "train.utd_terms.fixed_segs.list")
    fixed_labels_segs_list_fn = path.join(
        "lists", args.language, "train.utd_terms.fixed_labels_segs.list")
    if (not path.isfile(fixed_labels_list_fn)
            or not path.isfile(fixed_labels_segs_list_fn)
            or not path.isfile(fixed_segs_list_fn)):

        # Read UTD terms
        utd_list_fn = path.join("lists", args.language, "train.utd_terms.list")
        print("Reading:", utd_list_fn)
        # overlap_dict[speaker_utt][(start, end)] is a tuple of
        # (label, (start, end), overlap, cluster_label)
        overlap_dict = {}
        with codecs.open(utd_list_fn, "r", "utf-8") as utd_list_f:
            for line in utd_list_f:
                term, speaker, utt, start_end = line.strip().split("_")
                start, end = start_end.split("-")
                start = int(start)
                end = int(end)
                utt_key = speaker + "_" + utt
                if utt_key not in overlap_dict:
                    overlap_dict[utt_key] = {}
                overlap_dict[utt_key][(start, end)] = ("label", (0, 0), 0, term)

        # Read forced alignments
        fa_fn = path.join(gp_alignments_dir, args.language, "train.ctm")
        print("Reading:", fa_fn)
        fa_dict = {}
        with codecs.open(fa_fn, "r", "utf-8") as fa_f:
            for line in fa_f:
                utt_key, _, start, duration, label = line.strip().split()
                start = float(start)
                duration = float(duration)
                end = start + duration
                start_frame = int(round(start * 100))
                end_frame = int(round(end * 100))
                if (label != "<unk>" and label != "sil" and label != "?"
                        and label != "spn"):
                    if not utt_key in fa_dict:
                        fa_dict[utt_key] = {}
                    fa_dict[utt_key][start_frame, end_frame] = label

        # Find ground truth terms with maximal overlap
        print("Getting ground truth terms with maximal overlap:")
        for utt_key in tqdm(fa_dict):
            # print(utt_key)
            if utt_key not in overlap_dict:
                continue
            for (fa_start, fa_end) in fa_dict[utt_key]:
                for (utd_start, utd_end) in overlap_dict[utt_key]:
                    overlap = get_overlap(utd_start, utd_end, fa_start, fa_end)
                    if overlap == 0:
                        continue
                    if (overlap >
                            overlap_dict[utt_key][(utd_start, utd_end)][2]):
                        overlap_dict[utt_key][(utd_start, utd_end)] = (
                            fa_dict[utt_key][(fa_start, fa_end)],
                            (fa_start, fa_end), overlap,
                            overlap_dict[utt_key][(utd_start, utd_end)][3])

        # Write list with fixed labels
        if not path.isfile(fixed_labels_list_fn):
            print("Writing:", fixed_labels_list_fn)
            with codecs.open(fixed_labels_list_fn, "w", "utf-8") as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][0]
                        overlap = (overlap_dict[utt_key][(utd_start,
                                                          utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, utd_start, utd_end))
        else:
            print("Using existing file:", fixed_labels_list_fn)

        # Write list with fixed labels and segment intervals
        if not path.isfile(fixed_labels_segs_list_fn):
            print("Writing:", fixed_labels_segs_list_fn)
            with (codecs.open(fixed_labels_segs_list_fn, "w",
                              "utf-8")) as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][0]
                        fa_start, fa_end = (overlap_dict[utt_key][(
                            utd_start, utd_end)][1])
                        overlap = (overlap_dict[utt_key][(utd_start,
                                                          utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, fa_start, fa_end))
        else:
            print("Using existing file:", fixed_labels_segs_list_fn)

        # Write list with fixed segment intervals
        if not path.isfile(fixed_segs_list_fn):
            print("Writing:", fixed_segs_list_fn)
            with (codecs.open(fixed_segs_list_fn, "w", "utf-8")) as list_f:
                for utt_key in sorted(overlap_dict):
                    for (utd_start, utd_end) in overlap_dict[utt_key]:
                        label = overlap_dict[utt_key][(utd_start, utd_end)][3]
                        fa_start, fa_end = (overlap_dict[utt_key][(
                            utd_start, utd_end)][1])
                        overlap = (overlap_dict[utt_key][(utd_start,
                                                          utd_end)][2])
                        if overlap != 0:
                            list_f.write("{}_{}_{:06d}-{:06d}\n".format(
                                label, utt_key, fa_start, fa_end))
        else:
            print("Using existing file:", fixed_segs_list_fn)

    else:
        print("Using existing file:", fixed_labels_list_fn)
        print("Using existing file:", fixed_segs_list_fn)
        print("Using existing file:", fixed_labels_segs_list_fn)

    # Extract UTD with fixed labels
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir,
        args.language.lower() + ".train.utd_terms.fixed_labels.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels")
        utils.segments_from_npz(input_npz_fn, fixed_labels_list_fn,
                                output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract UTD with fixed segment intervals
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir,
        args.language.lower() + ".train.utd_terms.fixed_segs.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels and segment "
              "intervals")
        utils.segments_from_npz(input_npz_fn, fixed_segs_list_fn,
                                output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)

    # Extract UTD with fixed labels and segment intervals
    input_npz_fn = path.join(feat_dir, args.language.lower() + ".train.npz")
    output_npz_fn = path.join(
        feat_dir,
        args.language.lower() + ".train.utd_terms.fixed_labels_segs.npz")
    if not path.isfile(output_npz_fn):
        print("Extracting MFCCs for UTD tokens with fixed labels and segment "
              "intervals")
        utils.segments_from_npz(input_npz_fn, fixed_labels_segs_list_fn,
                                output_npz_fn)
    else:
        print("Using existing file:", output_npz_fn)