Example #1
0
def batch_sequence(stop_event,
                   queue,
                   data,
                   feature_list,
                   features2spk,
                   batch_size=128,
                   min_len=200,
                   max_len=400,
                   shuffle=True,
                   seed=0):
    """Load features and fill a queue. Used in KaldiDataSeqQueue.

    Args:
        stop_event: An event indicating the reading is finished.
        queue: A queue to put the data.
        data: The kaldi data directory.
        feature_list: A list shows which features the process should read.
        features2spk: A dict map features to speaker index.
        batch_size: The batch size.
        min_len: The minimum length of the features.
        max_len: The maximum length of the features.
        shuffle: Load the feature from the 0-th frame or a random frame.
        seed: The number used to generate a random seed.
    """
    # Re-seed per process (see the comment in batch_random): os.urandom gives
    # each forked worker a distinct initial state before the deterministic
    # seed is applied.
    rd = random.Random(os.urandom(4))
    rd.seed(seed)

    feature_reader = FeatureReader(data)
    num_batches = int(len(feature_list) / batch_size)
    for i in range(num_batches):
        batch_items = feature_list[i * batch_size:(i + 1) * batch_size]
        # utt2num_frames is keyed by the utterance id, i.e. the first
        # whitespace-separated token of each feature entry.
        utt_ids = [item.split(' ')[0] for item in batch_items]

        # In some cases the shortest utterance in the batch is smaller than
        # the sampled batch length. Clamp to the smallest length so every
        # segment can be read in full.
        batch_length = min(
            [rd.randint(min_len, max_len)] +
            [feature_reader.utt2num_frames[utt] for utt in utt_ids])

        features = np.zeros((batch_size, batch_length, feature_reader.dim),
                            dtype=np.float32)
        labels = np.zeros((batch_size), dtype=np.int32)
        for j, item in enumerate(batch_items):
            features[j, :, :], _ = feature_reader.read_segment(
                item, batch_length, shuffle=shuffle)
            labels[j] = features2spk[item]
        queue.put((features, labels))
    stop_event.set()
    print("The process {} is about to exit.".format(os.getpid()))
    return
Example #2
0
    # Load the pre-trained model to the target model directory.
    # The pre-trained model will be copied as the fine-tuned model and can be loaded from the new directory.
    # The pre-trained model is now just like an initialized model.
    get_pretrain_model(os.path.join(args.pretrain_model, "nnet"),
                       os.path.join(args.finetune_model, "nnet"),
                       args.checkpoint)

    # The model directory always has a folder named nnet
    model_dir = os.path.join(args.finetune_model, "nnet")

    # Set the random seed. The random operations may appear in data input, batch forming, etc.
    tf.set_random_seed(params.seed)
    random.seed(params.seed)
    np.random.seed(params.seed)

    dim = FeatureReader(args.train_dir).get_dim()
    if "selected_dim" in params.dict:
        dim = params.selected_dim

    with open(os.path.join(model_dir, "feature_dim"), "w") as f:
        f.write("%d\n" % dim)

    num_total_train_speakers = KaldiDataRandomQueue(args.train_dir, args.train_spklist).num_total_speakers
    tf.logging.info("There are %d speakers in the training set and the dim is %d" % (num_total_train_speakers, dim))

    min_valid_loss = ValidLoss()

    # The trainer is used to control the training process
    trainer = Trainer(params, args.finetune_model, dim, num_total_train_speakers)
    trainer.build("train")
    trainer.build("valid")
Example #3
0
# Positional argument: the directory holding the model to evaluate.
parser.add_argument("model", type=str, help="The output model directory.")

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    args = parser.parse_args()
    # NOTE(review): presumably the first argument means "continue from an
    # existing model dir" and the return value is the saved hyper-parameters
    # — confirm against save_codes_and_config's signature.
    params = save_codes_and_config(True, args.model, None)

    # The model directory always has a folder named nnet
    model_dir = os.path.join(args.model, "nnet")

    # Set the random seed. The random operations may appear in data input, batch forming, etc.
    tf.set_random_seed(params.seed)
    random.seed(params.seed)
    np.random.seed(params.seed)

    # The feature dimension is read from the data directory unless the
    # config explicitly overrides it with selected_dim.
    dim = FeatureReader(args.data_dir).get_dim()
    if "selected_dim" in params.dict:
        dim = params.selected_dim

    # NOTE(review): assumes the spklist file has one speaker per line, so the
    # line count is the number of training speakers — verify the file format.
    with open(args.data_spklist, 'r') as f:
        num_total_train_speakers = len(f.readlines())
    # Build only the validation graph and run one evaluation pass that also
    # returns the embeddings, then score them with pairwise cosine EER.
    trainer = Trainer(params, args.model, dim, num_total_train_speakers)
    trainer.build("valid")
    valid_loss, valid_embeddings, valid_labels = trainer.insight(
        args.data_dir,
        args.data_spklist,
        batch_type=params.batch_type,
        output_embeddings=True)
    eer = compute_cos_pairwise_eer(valid_embeddings, valid_labels)
    tf.logging.info("EER: %f" % eer)
    trainer.close()
Example #4
0
def batch_random(stop_event,
                 queue,
                 data,
                 spk2features,
                 num_total_speakers,
                 num_speakers=10,
                 num_segments=10,
                 min_len=200,
                 max_len=400,
                 shuffle=True,
                 seed=0):
    """Load features and fill a queue. Used in KaldiDataRandomQueue.

    Args:
        stop_event: An event to tell the process to stop.
        queue: A queue to put the data.
        data: The kaldi data directory.
        spk2features: A dict from speaker index to the segments.
        num_total_speakers: The total number of speakers.
        num_speakers: The number of speakers in the batch.
        num_segments: The number of segments per speaker.
        min_len: The minimum length of the features.
        max_len: The maximum length of the features.
        shuffle: Load the feature from the 0-th frame or a random frame.
        seed: The value used to generate the random seed.
    """
    # NOTE: if numpy.random is used in a sub-process it must be re-seeded,
    # e.g. local_state = np.random.RandomState(seed). Here os.urandom gives
    # each forked worker a distinct initial state before the deterministic
    # seed is applied.
    rd = random.Random(os.urandom(4))
    rd.seed(seed)

    feature_reader = FeatureReader(data)
    speakers = list(spk2features.keys())
    if num_total_speakers < num_speakers:
        print(
            "[Warning] The number of available speakers are less than the required speaker. Some speakers will be duplicated."
        )
        speakers = speakers * (int(num_speakers / num_total_speakers) + 1)
    # Now we have enough speakers.
    while not stop_event.is_set():
        # Pick the speakers for this batch and a common segment length
        # uniformly in [min_len, max_len].
        batch_speakers = rd.sample(speakers, num_speakers)
        batch_length = rd.randint(min_len, max_len)
        features = np.zeros(
            (num_speakers * num_segments, batch_length, feature_reader.dim),
            dtype=np.float32)  # (batch_size, frame_length, feat_dim)
        labels = np.zeros((num_speakers * num_segments),
                          dtype=np.int32)  # (batch_size,)
        for i, speaker in enumerate(batch_speakers):
            # The sampled length may exceed some utterances; keep only the
            # utterances that are long enough (utt2num_frames is keyed by the
            # utterance id, the first token of each feature entry).
            spk = speaker
            feature_list = []
            while not feature_list:
                feature_list = [
                    feat for feat in spk2features[spk]
                    if feature_reader.utt2num_frames[feat.split(' ')[0]] >
                    batch_length
                ]
                if not feature_list:
                    # The speaker has no utterance long enough for this batch.
                    # Resample a speaker not already in the batch.
                    spk = rd.choice(list(set(speakers) - set(batch_speakers)))
                    batch_speakers[i] = spk

            labels[i * num_segments:(i + 1) * num_segments] = spk
            # Duplicate the list if the speaker has fewer eligible utterances
            # than the number of segments required.
            if len(feature_list) < num_segments:
                feature_list *= (int(num_segments / len(feature_list)) + 1)
            # Now the list is at least num_segments long; draw the segments
            # for this speaker without replacement.
            speaker_features = rd.sample(feature_list, num_segments)
            for j, feat in enumerate(speaker_features):
                features[i * num_segments +
                         j, :, :], _ = feature_reader.read_segment(
                             feat, batch_length, shuffle=shuffle)
        queue.put((features, labels))

    # Drain the queue so this feeder process can exit cleanly. Catch only
    # Exception (e.g. the queue raises Empty when a consumer wins the race);
    # a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
    time.sleep(3)
    while not queue.empty():
        try:
            queue.get(block=False)
        except Exception:
            pass
    print("The process {} is about to exit.".format(os.getpid()))
    return
import tensorflow as tf

if __name__ == '__main__':
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)
    nnet_dir = os.path.join(args.model_dir, "nnet")
    config_json = os.path.join(args.model_dir, "nnet/config.json")
    if not os.path.isfile(config_json):
        sys.exit("Cannot find params.json in %s" % config_json)
    params = Params(config_json)

    # First, we need to extract the weights
    num_total_train_speakers = KaldiDataRandomQueue(
        os.path.dirname(args.spklist), args.spklist).num_total_speakers
    dim = FeatureReader(os.path.dirname(args.spklist)).get_dim()
    if "selected_dim" in params.dict:
        dim = params.selected_dim
    trainer = Trainer(params,
                      args.model_dir,
                      dim,
                      num_total_train_speakers,
                      single_cpu=True)
    trainer.build("valid")
    trainer.sess.run(tf.global_variables_initializer())
    trainer.sess.run(tf.local_variables_initializer())

    if not args.init:
        curr_step = trainer.load()
    else:
        # Hack: