Ejemplo n.º 1
0
                        help="# of process for Multiprocessing",
                        default=8,
                        type=int)

    args = parser.parse_args()

    assert os.path.isdir(str(
        args.wsj0)), "WSJ0 directory not found - '{d}'".format(d=args.wsj0)
    assert os.path.isdir(str(
        args.wsj1)), "WSJ1 directory not found - '{d}'".format(d=args.wsj1)
    assert args.wsj0 != args.wsj1, "WSJ0 and WSJ1 directories can't be the same"
    assert os.path.exists(
        args.sph2pipe), "sph2pipe not found '{d}'".format(d=args.sph2pipe)

    transcripts = {}
    utils.find_transcripts(args.wsj0, transcripts)
    utils.find_transcripts(args.wsj1, transcripts)

    sets = {}
    sets["si84"] = utils.ndx2idlist(
        args.wsj0,
        "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx",
        transcripts,
        lambda line: None if "11_2_1:wsj0/si_tr_s/401" in line else line,
    )
    assert len(sets["si84"]) == 7138

    sets["si284"] = utils.ndx2idlist(
        args.wsj0,
        "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx",
        transcripts,
Ejemplo n.º 2
0
                        default=8,
                        type=int)

    args = parser.parse_args()
    wsj1_sep = "-" if args.wsj1_type == "LDC94S13A" else "_"

    assert os.path.isdir(str(
        args.wsj0)), "WSJ0 directory is not found - '{d}'".format(d=args.wsj0)
    assert os.path.isdir(str(
        args.wsj1)), "WSJ1 directory is not found - '{d}'".format(d=args.wsj1)
    assert args.wsj0 != args.wsj1, "WSJ0 and WSJ1 directories can't be the same"
    assert os.path.exists(
        args.sph2pipe), "sph2pipe not found '{d}'".format(d=args.sph2pipe)

    # Prepare audio data
    transcripts = find_transcripts([args.wsj0, args.wsj1])

    subsets = dict()
    subsets["si84"] = ndx_to_samples(
        args.wsj0,
        "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx",
        transcripts,
        lambda line: None if "11_2_1:wsj0/si_tr_s/401" in line else line,
    )
    # Sanity-check the size of the si84 subset. The whole failure message is
    # wrapped in parentheses so that both string literals (and the .format
    # call) belong to the assert; without them the second literal is a
    # separate no-op statement and the actual count is never reported.
    assert len(subsets["si84"]) == 7138, (
        "Incorrect number of samples in si84 part:"
        " should be 7138, but found #{}.".format(len(subsets["si84"])))

    subsets["si284"] = ndx_to_samples(
        args.wsj0,
        "11-13.1/wsj0/doc/indices/train/tr_s_wv1.ndx",