# Example #1 ("Beispiel" = German for "example"); score: 0
def train_nn(speech_data, speech_alignment):
    """Train a Theano feed-forward NN classifier on aligned speech data.

    Pre-computed features are loaded from the cache file named by the module
    global ``fetures_file_name`` (typo is pre-existing) if it exists;
    otherwise they are regenerated with ``gen_features``.  The network is
    grown incrementally: after the first epoch, one extra hidden layer is
    added each epoch until ``hidden_layers + hidden_layers_add`` layers are
    reached.  The model is saved (both .ffnn and .tffnn formats) whenever the
    current epoch has the best cross-validation accuracy so far.

    Relies on many module-level configuration globals not visible in this
    block (prev_frames, next_frames, hidden_units, hidden_layers, batch_size,
    max_epoch, learning_rate, learning_rate_decay, method, hact, ...).

    :param speech_data: speech data, passed through to ``gen_features``
    :param speech_alignment: alignments matching ``speech_data``
    """
    print
    print datetime.datetime.now()
    print
    random.seed(0)  # fixed seed for reproducible runs
    try:
        # Reuse cached features; the np.load calls must match the order in
        # which the arrays were np.save'd when the cache was written.
        f = open(fetures_file_name, "rb")
        crossvalid_x = np.load(f)
        crossvalid_y = np.load(f)
        train_x = np.load(f)
        train_y = np.load(f)
        tx_m = np.load(f)
        tx_std = np.load(f)
        f.close()
    except IOError:
        # No cache file - generate the features from scratch.
        crossvalid_x, crossvalid_y, train_x, train_y, tx_m, tx_std = gen_features(
            speech_data, speech_alignment)

    # NN input covers the current frame plus prev/next context frames.
    input_size = train_x.shape[1] * (prev_frames + 1 + next_frames)
    # Assumes labels are contiguous class indices 0..max.
    output_size = np.amax(train_y) + 1

    print "The shape of non-multiplied training data: ", train_x.shape, train_y.shape
    print "The shape of non-multiplied test data:     ", crossvalid_x.shape, crossvalid_y.shape

    # Tile the normalisation stats so they cover the stacked context frames.
    tx_m = np.tile(tx_m, prev_frames + 1 + next_frames)
    tx_std = np.tile(tx_std, prev_frames + 1 + next_frames)

    print
    print datetime.datetime.now()
    print
    print "prev_frames +1+ last_frames:", prev_frames, 1, next_frames
    print "The shape of training data: ", train_x.shape, train_y.shape
    print "The shape of test data:     ", crossvalid_x.shape, crossvalid_y.shape
    print "The shape of tx_m, tx_std:  ", tx_m.shape, tx_std.shape
    print "The output size:            ", output_size
    print

    # Build the Theano feed-forward network (project-local tffnn module).
    e = tffnn.TheanoFFNN(input_size,
                         hidden_units,
                         hidden_layers,
                         output_size,
                         hidden_activation=hact,
                         weight_l2=weight_l2,
                         training_set_x=train_x,
                         training_set_y=train_y,
                         prev_frames=prev_frames,
                         next_frames=next_frames,
                         amplify_center_frame=amplify_center_frame,
                         batch_size=batch_size)
    e.set_input_norm(tx_m, tx_std)

    # Per-epoch accuracy histories: cross-validation and training.
    dc_acc = []
    dt_acc = []

    epoch = 0
    i_hidden_layers = hidden_layers  # current depth of the growing network

    while True:

        print
        print '-' * 80
        print 'Predictions'
        print '-' * 80
        # Evaluate on both sets before (each round of) training.
        predictions_y, gold_y = e.predict(crossvalid_x, batch_size,
                                          prev_frames, next_frames,
                                          crossvalid_y)
        c_acc, c_sil = get_accuracy(gold_y, predictions_y)
        predictions_y, gold_y = e.predict(train_x, batch_size, prev_frames,
                                          next_frames, train_y)
        t_acc, t_sil = get_accuracy(gold_y, predictions_y)

        dc_acc.append(c_acc)
        dt_acc.append(t_acc)

        print
        print "method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, " \
              "hidden_units, hidden_layers, hidden_layers_add, prev_frames, next_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only "
        print method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, \
            hidden_units, hidden_layers, hidden_layers_add, prev_frames, next_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only
        print "Epoch: %d" % (epoch, )
        print
        print "Cross-validation stats"
        print "------------------------"
        print "Epoch predictive accuracy:  %0.4f" % c_acc
        print "Last epoch accs:", ["%.4f" % x for x in dc_acc[-20:]]
        print "Epoch sil bias: %0.2f" % c_sil
        print
        print "Training stats"
        print "------------------------"
        print "Epoch predictive accuracy:  %0.4f" % t_acc
        print "Last epoch accs:", ["%.4f" % x for x in dt_acc[-20:]]
        print "Epoch sil bias: %0.2f" % t_sil
        print
        print "Best results"
        print "------------------------"
        print "Best iteration:", np.argmax(dc_acc)
        print "Best iteration - cross-validation acc: %.4f" % dc_acc[np.argmax(
            dc_acc)]
        print "Best iteration - training acc:         %.4f" % dt_acc[np.argmax(
            dc_acc)]
        print
        print datetime.datetime.now()
        print

        # Save only when this epoch is the best cross-validation epoch so far.
        if epoch == np.argmax(dc_acc):
            print
            print "Saving the FFNN and TFFN models"
            print

            # Model file name encodes the full hyper-parameter configuration.
            file_name = "model_voip/lid_nnt_%d_hu%d_hl%d_hla%d_pf%d_nf%d_acf_%.1f_mfr%d_mfl%d_mfps%d_ts%d_usec0%d_usedelta%d_useacc%d_mbo%d_bs%d" % \
                                 (input_size, hidden_units, hidden_layers, hidden_layers_add,
                                  prev_frames, next_frames, amplify_center_frame, max_frames, max_files, max_frames_per_segment,
                                  trim_segments,
                                  usec0, usedelta, useacc, mel_banks_only, batch_size)
            nn = ffnn.FFNN()
            # e.params holds alternating weight/bias shared variables; walk
            # them in (W, b) pairs and copy into the plain-numpy FFNN model.
            for w, b in [
                    e.params[n:n + 2] for n in range(0, len(e.params), 2)
            ]:
                nn.add_layer(w.get_value(), b.get_value())
            nn.set_input_norm(tx_m, tx_std)
            nn.save(file_name + ".ffnn")

            e.save(file_name + ".tffnn")

        if epoch == max_epoch:
            break
        epoch += 1

        # NOTE(review): ``epoch % 1 == 0`` is always true, so in effect one
        # hidden layer is added every epoch after the first, until the target
        # depth (hidden_layers + hidden_layers_add) is reached.
        if epoch > 1 and epoch % 1 == 0:
            if i_hidden_layers < hidden_layers + hidden_layers_add:
                print
                print '-' * 80
                print 'Adding a hidden layer: ', i_hidden_layers + 1
                print '-' * 80
                e.add_hidden_layer(hidden_units)
                i_hidden_layers += 1

        print
        print '-' * 80
        print 'Training'
        print '-' * 80
        # Learning rate decays hyperbolically with the epoch number.
        e.train(method=method,
                learning_rate=learning_rate * learning_rate_decay /
                (learning_rate_decay + epoch))
# Example #2 ("Beispiel" = German for "example"); score: 0
def train_nn(speech_data, speech_alignment):
    """Train a theanets VAD classifier (sil vs. speech) on aligned speech.

    MFCC frames are streamed from ``MLFMFCCOnlineAlignedArray`` and split at
    random into cross-validation and training sets; labels are binary
    (0 = "sil", 1 = speech).  Training uses theanets' Hessian-free
    optimizer; after every epoch the network is exported to a plain-numpy
    ``ffnn.FFNN`` model file.

    Relies on many module-level configuration globals not visible in this
    block (usec0, last_frames, usedelta, useacc, mel_banks_only, max_files,
    max_frames, max_frames_per_segment, crossvalid_frames, hidden_units,
    max_epoch, preconditioner, hidden_dropouts, weight_l2, ...).

    :param speech_data: iterable of speech (trn) data sources
    :param speech_alignment: matching iterable of MLF alignment sources
    """
    vta = MLFMFCCOnlineAlignedArray(usec0=usec0,
                                    n_last_frames=last_frames,
                                    usedelta=usedelta,
                                    useacc=useacc,
                                    mel_banks_only=mel_banks_only)
    # Tally total frame counts per class while feeding the aligner.
    sil_count = 0
    speech_count = 0
    for sd, sa in zip(speech_data, speech_alignment):
        mlf_speech = load_mlf(sa, max_files, max_frames_per_segment)
        vta.append_mlf(mlf_speech)
        vta.append_trn(sd)

        sil_count += mlf_speech.count_length('sil')
        speech_count += mlf_speech.count_length('speech')

    print "The length of sil segments:    ", sil_count
    print "The length of speech segments: ", speech_count

    # Peek at the first (frame, label) pair to learn the feature dimension
    # (Python 2 iterator protocol: .next()).
    mfcc = vta.__iter__().next()

    print "MFCC length:", len(mfcc[0])
    input_size = len(mfcc[0])

    # Binary classifier with four equal hidden layers, trained with
    # Hessian-free optimization.
    e = theanets.Experiment(
        theanets.Classifier,
        layers=(input_size, hidden_units, hidden_units, hidden_units,
                hidden_units, 2),
        optimize="hf",
        num_updates=30,
        validate=1,
        initial_lambda=0.1,
        preconditioner=True if preconditioner else False,
        hidden_dropouts=hidden_dropouts,
        weight_l2=weight_l2,
        batch_size=500,
    )

    random.seed(0)  # deterministic train/cross-validation split
    print "Generating the cross-validation and train MFCC features"
    crossvalid_x = []
    crossvalid_y = []
    train_x = []
    train_y = []
    i = 0
    for frame, label in vta:
        # Shift mel-bank features by a constant offset (no-op otherwise).
        frame = frame - (10.0 if mel_banks_only else 0.0)

        if i % (max_frames / 10) == 0:
            print "Already processed: %.2f%% of data" % (100.0 * i /
                                                         max_frames)

        if i > max_frames:
            break

        # Randomly route ~crossvalid_frames frames into the validation set.
        if random.random() < float(crossvalid_frames) / max_frames:
            # sample validation (test) data
            crossvalid_x.append(frame)
            if label == "sil":
                crossvalid_y.append(0)
            else:
                crossvalid_y.append(1)
        else:
            train_x.append(frame)
            if label == "sil":
                train_y.append(0)
            else:
                train_y.append(1)

        i += 1

    # Pack the python lists into the [X, y] pairs theanets' run() expects.
    crossvalid = [
        np.array(crossvalid_x),
        np.array(crossvalid_y).astype('int32')
    ]
    train = [np.array(train_x), np.array(train_y).astype('int32')]

    print
    print "The length of training data: ", len(train_x)
    print "The length of test data:     ", len(crossvalid_x)
    print

    # Rolling windows of the last 20 per-epoch accuracies.
    dc_acc = deque(maxlen=20)
    dt_acc = deque(maxlen=20)

    epoch = 0
    while True:

        # NOTE(review): predict() is called on the raw python lists here,
        # not the packed numpy arrays - presumably theanets accepts both;
        # confirm against the theanets version in use.
        predictions_y = e.network.predict(crossvalid_x)
        c_acc, c_sil = get_accuracy(crossvalid_y, predictions_y)
        predictions_y = e.network.predict(train_x)
        t_acc, t_sil = get_accuracy(train_y, predictions_y)

        print
        print "max_frames, max_files, max_frames_per_segment, trim_segments, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only, preconditioner, hidden_dropouts, weight_l2"
        print max_frames, max_files, max_frames_per_segment, trim_segments, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only, preconditioner, hidden_dropouts, weight_l2
        print "Epoch: %d" % (epoch, )
        print
        print "Cross-validation stats"
        print "------------------------"
        print "Epoch predictive accuracy:  %0.2f" % c_acc
        # Accuracies are appended only after training below, so this history
        # excludes the current epoch.
        print "Last epoch accs:", ["%.2f" % x for x in dc_acc]
        print "Epoch sil bias: %0.2f" % c_sil
        print
        print "Training stats"
        print "------------------------"
        print "Epoch predictive accuracy:  %0.2f" % t_acc
        print "Last epoch accs:", ["%.2f" % x for x in dt_acc]
        print "Epoch sil bias: %0.2f" % t_sil

        if epoch == max_epoch:
            break
        epoch += 1

        # One round of HF training with the validation set for monitoring.
        e.run(train, crossvalid)

        dc_acc.append(c_acc)
        dt_acc.append(t_acc)

        # Export the current network to a plain-numpy FFNN model; the file
        # name encodes the full hyper-parameter configuration.
        nn = ffnn.FFNN()
        for w, b in zip(e.network.weights, e.network.biases):
            nn.add_layer(w.get_value(), b.get_value())
        nn.save(file_name = "model_voip/vad_sds_mfcc_is%d_hu%d_lf%d_mfr%d_mfl%d_mfps%d_ts%d_usec0%d_usedelta%d_useacc%d_mbo%d.nn" % \
                            (input_size, hidden_units, last_frames, max_frames, max_files, max_frames_per_segment, trim_segments, usec0, usedelta, useacc, mel_banks_only))
# Example #3 ("Beispiel" = German for "example"); score: 0
def train_nn(speech_data, speech_alignment):
    """Train a TheanoFFNN2 VAD classifier (sil vs. speech) on aligned speech.

    Features are loaded from the cache file named by the module global
    ``fetures_file_name`` (typo is pre-existing) if possible; otherwise MFCC
    frames are streamed from ``MLFMFCCOnlineAlignedArray``, split at random
    into cross-validation/training sets, normalised (zero mean, unit std
    from the training set) and written back to the cache.  Labels are binary
    (0 = "sil", 1 = speech).  The model is saved whenever the current epoch
    is the best so far on the cross-validation set.

    Relies on many module-level configuration globals not visible in this
    block (usec0, last_frames, usedelta, useacc, mel_banks_only, max_files,
    max_frames, max_frames_per_segment, crossvalid_frames, hidden_units,
    hidden_layers, hact, weight_l2, batch_size, max_epoch, method,
    learning_rate, learning_rate_decay, ...).

    :param speech_data: iterable of speech (trn) data sources
    :param speech_alignment: matching iterable of MLF alignment sources
    """
    print
    print datetime.datetime.now()
    print
    random.seed(0)  # deterministic train/cross-validation split
    try:
        # Reuse cached features; np.load order must match the np.save order
        # used when the cache was written below.
        f = open(fetures_file_name, "rb")
        crossvalid_x = np.load(f)
        crossvalid_y = np.load(f)
        train_x = np.load(f)
        train_y = np.load(f)
        f.close()

    # NOTE(review): bare except - any failure (not just a missing cache
    # file) silently triggers full feature regeneration; an explicit
    # ``except IOError:`` would be safer.
    except:
        vta = MLFMFCCOnlineAlignedArray(usec0=usec0,
                                        n_last_frames=last_frames,
                                        usedelta=usedelta,
                                        useacc=useacc,
                                        mel_banks_only=mel_banks_only)
        # Tally total frame counts per class while feeding the aligner.
        sil_count = 0
        speech_count = 0
        for sd, sa in zip(speech_data, speech_alignment):
            mlf_speech = load_mlf(sa, max_files, max_frames_per_segment)
            vta.append_mlf(mlf_speech)
            vta.append_trn(sd)

            sil_count += mlf_speech.count_length('sil')
            speech_count += mlf_speech.count_length('speech')

        print "The length of sil segments:    ", sil_count
        print "The length of speech segments: ", speech_count

        # Peek at the first (frame, label) pair to learn the feature
        # dimension (Python 2 iterator protocol: .next()).
        mfcc = vta.__iter__().next()

        print "MFCC length:", len(mfcc[0])
        input_size = len(mfcc[0])

        print "Generating the cross-validation and train MFCC features"
        crossvalid_x = []
        crossvalid_y = []
        train_x = []
        train_y = []
        i = 0
        for frame, label in vta:
            # downcast to float32 to halve memory use
            frame = frame.astype(np.float32)
            #        frame = frame - (10.0 if mel_banks_only else 0.0)

            if i % (max_frames / 10) == 0:
                print "Already processed: %.2f%% of data" % (100.0 * i /
                                                             max_frames)

            if i > max_frames:
                break

            # Randomly route ~crossvalid_frames frames to the validation set.
            if random.random() < float(crossvalid_frames) / max_frames:
                # sample validation (test) data
                crossvalid_x.append(frame)
                if label == "sil":
                    crossvalid_y.append(0)
                else:
                    crossvalid_y.append(1)
            else:
                train_x.append(frame)
                if label == "sil":
                    train_y.append(0)
                else:
                    train_y.append(1)

            i += 1

        # The gc.collect() calls between the big allocations below keep peak
        # memory down while the python lists are converted to numpy arrays.
        gc.collect()
        crossvalid_x = np.array(crossvalid_x).astype(np.float32)
        gc.collect()
        crossvalid_y = np.array(crossvalid_y).astype('int32')
        gc.collect()
        train_x = np.array(train_x).astype(np.float32)
        gc.collect()
        train_y = np.array(train_y).astype('int32')
        gc.collect()

        # Normalise with the training-set statistics (applied to both sets).
        tx_m = np.mean(train_x, axis=0)
        tx_std = np.std(train_x, axis=0)

        gc.collect()
        crossvalid_x -= tx_m
        gc.collect()
        crossvalid_x /= tx_std
        gc.collect()
        train_x -= tx_m
        gc.collect()
        train_x /= tx_std
        gc.collect()

        # Cache the generated features for the next run (same order as the
        # np.load calls above).
        print 'Saving data to:', fetures_file_name
        f = open(fetures_file_name, "wb")
        np.save(f, crossvalid_x)
        np.save(f, crossvalid_y)
        np.save(f, train_x)
        np.save(f, train_y)
        f.close()

    print
    print datetime.datetime.now()
    print
    print "The shape of training data: ", train_x.shape, train_y.shape
    print "The shape of test data:     ", crossvalid_x.shape, crossvalid_y.shape
    print

    # (Re)derive the input dimension from the data actually in use.
    input_size = train_x.shape[1]

    # Binary classifier built with the project-local TheanoFFNN2.
    e = ffnn.TheanoFFNN2(input_size,
                         hidden_units,
                         hidden_layers,
                         2,
                         hidden_activation=hact,
                         weight_l2=weight_l2)

    # Per-epoch accuracy histories: cross-validation and training.
    dc_acc = []
    dt_acc = []

    epoch = 0
    while True:

        print
        print '-' * 80
        print 'Predictions'
        print '-' * 80
        # Evaluate on both sets before (each round of) training.
        predictions_y = e.predict(crossvalid_x, batch_size)
        c_acc, c_sil = get_accuracy(crossvalid_y, predictions_y)
        predictions_y = e.predict(train_x, batch_size)
        t_acc, t_sil = get_accuracy(train_y, predictions_y)

        dc_acc.append(c_acc)
        dt_acc.append(t_acc)

        print
        print "method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only "
        print method, hact, max_frames, max_files, max_frames_per_segment, trim_segments, batch_size, max_epoch, hidden_units, last_frames, crossvalid_frames, usec0, usedelta, useacc, mel_banks_only
        print "Epoch: %d" % (epoch, )
        print
        print "Cross-validation stats"
        print "------------------------"
        print "Epoch predictive accuracy:  %0.4f" % c_acc
        print "Last epoch accs:", ["%.4f" % x for x in dc_acc[-20:]]
        print "Epoch sil bias: %0.2f" % c_sil
        print
        print "Training stats"
        print "------------------------"
        print "Epoch predictive accuracy:  %0.4f" % t_acc
        print "Last epoch accs:", ["%.4f" % x for x in dt_acc[-20:]]
        print "Epoch sil bias: %0.2f" % t_sil
        print
        print "Best results"
        print "------------------------"
        print "Best iteration:", np.argmax(dc_acc)
        print "Best iteration - cross-validation acc: %.4f" % dc_acc[np.argmax(
            dc_acc)]
        print "Best iteration - training acc:         %.4f" % dt_acc[np.argmax(
            dc_acc)]
        print
        print datetime.datetime.now()
        print

        # Save only when this epoch is the best cross-validation epoch so far.
        if epoch == np.argmax(dc_acc):
            print
            print "Saving the FFNN model"
            print

            # Copy each (W, b) layer pair into the plain-numpy FFNN model;
            # the file name encodes the full hyper-parameter configuration.
            nn = ffnn.FFNN()
            for w, b in e.params:
                nn.add_layer(w.get_value(), b.get_value())
            nn.save(file_name = "model_voip/vad_sds_mfcc_is%d_hu%d_lf%d_mfr%d_mfl%d_mfps%d_ts%d_usec0%d_usedelta%d_useacc%d_mbo%d.nnt" % \
                                 (input_size, hidden_units, last_frames, max_frames, max_files, max_frames_per_segment, trim_segments,
                                 usec0, usedelta, useacc, mel_banks_only))

        if epoch == max_epoch:
            break
        epoch += 1

        print
        print '-' * 80
        print 'Training'
        print '-' * 80
        # Learning rate decays hyperbolically with the epoch number.
        e.train(train_x,
                train_y,
                method=method,
                learning_rate=learning_rate * learning_rate_decay /
                (learning_rate_decay + epoch),
                batch_size=batch_size)