def write_manifest(frame, name):
    """Append the rows of ``frame`` to the shared manifest and save it as TSV.

    The first four columns of ``frame`` are read by position and stored under
    the manifest keys ``id``, ``audio``, ``n_frames`` and ``tgt_text``.
    Rows are appended to the module-level ``manifest`` dict, so entries added
    by earlier calls are still present when the table is written.

    Args:
        frame: DataFrame-like object exposing ``.shape`` and ``.values`` with
            at least four columns in the order listed above.
        name: File stem; the table is written to ``<rootpath>/<name>.tsv``.
    """
    fields = ("id", "audio", "n_frames", "tgt_text")
    # ``frame.values`` may rebuild the whole array on every access (it does
    # for mixed-dtype frames), so take it once instead of four times per row.
    rows = frame.values
    length = frame.shape[0]
    for pos, row in enumerate(rows):
        print(pos, " von ", length)
        for column, field in enumerate(fields):
            manifest[field].append(row[column])
    df_manifest = pd.DataFrame.from_dict(manifest)
    save_df_to_tsv(df_manifest, Path(rootpath) / f"{name}.tsv")
def generate_manifest(split, manifest):
    """Write ``manifest`` to ``<root_path_data>/<split>_<task>.tsv``.

    Args:
        split: Label placed at the start of the output file name.
        manifest: Mapping of column name to values, converted with
            ``pd.DataFrame.from_dict``.
    """
    table = pd.DataFrame.from_dict(manifest)
    # ``root_path_data`` and ``task`` are module-level settings.
    target = Path(root_path_data) / f"{split}_{task}.tsv"
    save_df_to_tsv(table, target)
Example #3
0
        counter_test = 0
        for audio in os.listdir(path):
            path_id = path + "/" + audio
            # Numeric id: the file name without the "<dialect>_" prefix and
            # the ".wav" suffix. Parsed once and reused by every branch.
            audio_id = int(audio.replace(dialect + "_", "").replace(".wav", ""))
            # Each file must land in exactly one split. The branches form a
            # single if/elif/else chain so a file assigned to the test split
            # is not additionally written to dev or train.
            if counter % 2 == 0 and counter_test < split_size:
                audio_processing(path_id, folder, audio, audio_id, test)
                counter_test = counter_test + 1
            elif counter % 3 == 0 and counter_dev < split_size:
                audio_processing(path_id, folder, audio, audio_id, dev)
                counter_dev = counter_dev + 1
            else:
                # Everything not claimed by test or dev goes to train.
                audio_processing(path_id, folder, audio, audio_id, train)
            counter = counter + 1

# Write each split's manifest to "<split>_st_ch_de.tsv" under ``root``.
# The loop keeps the original train -> test -> dev order, so ``df`` still
# holds the dev table afterwards.
for split_name, split_manifest in (("train", train), ("test", test), ("dev", dev)):
    df = pd.DataFrame.from_dict(split_manifest)
    save_df_to_tsv(df, Path(root) / f"{split_name}_st_ch_de.tsv")

spm_filename_prefix = "spm_char_st_ch_de"
# Generate config YAML
gen_config_yaml(
    Path(root),
    spm_filename_prefix + ".model",
    yaml_filename="config_st_ch_de.yaml",
    specaugment_policy="lb",
)
# generating vocabulary
if len(train_text) > 0:
Example #4
0
def generate_manifest(manifest, path):
    """Convert ``manifest`` to a DataFrame and save it as TSV at ``path``.

    Args:
        manifest: Mapping of column name to values, converted with
            ``pd.DataFrame.from_dict``.
        path: Destination of the TSV file; wrapped in ``Path`` before saving.
    """
    save_df_to_tsv(pd.DataFrame.from_dict(manifest), Path(path))
Example #5
0
import pandas as pd
from data_utils import save_df_to_tsv
from pathlib import Path

# Column layout of the manifest TSV written at the end of this script.
MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text"]
manifest = {c: [] for c in MANIFEST_COLUMNS}
rootpath = "/Users/bogumiladubel/Documents/BA/data/st/eth_swiss_dialects/split_dialect/swissdial/"

frame = pd.read_csv(rootpath + "test_diff.csv")

# ``frame.values`` may rebuild the whole array on every access (it does for
# mixed-dtype frames), so take it once instead of four times per row.
rows = frame.values
length = frame.shape[0]
for pos, row in enumerate(rows):
    print(pos, " von ", length)
    # Manifest fields are read from CSV columns 1-4; column 0 is skipped
    # (presumably a saved index column -- confirm against the CSV).
    for offset, column in enumerate(MANIFEST_COLUMNS, start=1):
        manifest[column].append(row[offset])
df_manifest = pd.DataFrame.from_dict(manifest)
name = "test_diff_zh"
save_df_to_tsv(df_manifest, Path(rootpath) / f"{name}.tsv")
Example #6
0
def save_manifest(file, manifest):
    """Save ``manifest`` as ``<file>.tsv`` inside ``root_path_data``.

    Args:
        file: Output file stem; ``.tsv`` is appended.
        manifest: Mapping of column name to values, converted with
            ``pd.DataFrame.from_dict``.
    """
    table = pd.DataFrame.from_dict(manifest)
    destination = Path(root_path_data) / f"{file}.tsv"
    save_df_to_tsv(table, destination)
Example #7
0
    return transcript.strip()


# ``df.values`` may rebuild the whole array on every access (it does for
# mixed-dtype frames), so take it once instead of five times per row.
rows = df.values
total = df.shape[0]
for t, row in enumerate(rows):
    # Progress message every 1000 rows.
    if t % 1000 == 0:
        print(t, " von ", total)
    manifest["id"].append(row[0])
    manifest["audio"].append(row[1])
    manifest["n_frames"].append(row[2])
    # The cleaned transcript is stored in the manifest and also collected in
    # ``train_text``.
    target = preprocess_transcript(str(row[3]))
    train_text.append(target)
    manifest["tgt_text"].append(target)
    manifest["speaker"].append(row[4])

# ``df`` is rebound here to the manifest table that gets written out.
df = pd.DataFrame.from_dict(manifest)
save_df_to_tsv(df, Path(rootpath) / f"{file}")


def gen_voc(train_text, spm_filename_prefix):
    """Append the training text to ``test.txt`` and build a vocabulary from it.

    Each entry of ``train_text`` is written on its own line with its items
    joined by single spaces (for a string entry that means one space between
    every character). The file lives in ``root_path_data`` and is opened in
    append mode, so lines from earlier runs are kept.

    Args:
        train_text: Iterable of strings (or iterables of strings) to write.
        spm_filename_prefix: Name joined onto ``root_path_data`` and passed
            to ``gen_vocab`` as its second argument.
    """
    text_path = Path(root_path_data) / "test.txt"
    # Close the file before ``gen_vocab`` reads it; otherwise buffered lines
    # may not be on disk yet and the handle is leaked.
    with open(text_path, "a") as f:
        f.writelines(" ".join(t) + "\n" for t in train_text)
    print(text_path)
    gen_vocab(text_path, Path(root_path_data) / spm_filename_prefix)


# Task tag; interpolated into the file-name prefix on the next line.
task = "asr_de"
# Resolves to "spm_char_asr_de" -- presumably a SentencePiece model prefix;
# confirm against the helpers that consume it.
spm_filename_prefix = f"spm_char_{task}"
# Generate config YAML
gen_config_yaml(
    Path(root_path_data),