Example #1
 def test_load_config(self):
     ''' load config unittest '''
     conf = utils.load_config(self.conf_file)
     self.assertDictEqual(conf, self.conf_true)
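For context, a minimal sketch of what a load_config helper like the one tested above typically does, assuming a PyYAML-backed implementation (the real utils.load_config may differ):

import yaml

def load_config(config_file):
  # Hypothetical YAML-backed loader: parse the config file into a plain dict.
  # This is an assumption for illustration, not the library's actual code.
  with open(config_file, 'r', encoding='utf-8') as f:
    return yaml.safe_load(f)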
Example #2
def main(argv):
  """
    main function
  """
  # pylint: disable=unused-argument

  if FLAGS.config != '':
    config = utils.load_config(FLAGS.config)
    utils.set_logging(FLAGS.log_debug, config)

    utils.copy_config(FLAGS.config, config)
    set_seed(config)
  else:
    config = None

  logging.info("Loading all modules ...")
  import_all_modules_for_register(config, only_nlp=FLAGS.only_nlp)

  logging.info("CMD: {}".format(FLAGS.cmd))
  if FLAGS.cmd in ('train', 'train_and_eval', 'eval', 'infer',
                   'export_model', 'gen_feat', 'gen_cmvn'):
    solver_name = config['solver']['name']
    solver = registers.solver[solver_name](config)
    # config after process
    config = solver.config
    task_name = config['data']['task']['name']
    task_class = registers.task[task_name]
    if FLAGS.cmd == 'train':
      solver.train()
    elif FLAGS.cmd == 'train_and_eval':
      solver.train_and_eval()
    elif FLAGS.cmd == 'eval':
      solver.eval()
    elif FLAGS.cmd == 'infer':
      solver.infer(yield_single_examples=False)
    elif FLAGS.cmd == 'export_model':
      solver.export_model()
    elif FLAGS.cmd == 'gen_feat':
      assert config['data']['task']['suffix'] == '.npy', \
          'wav input does not need feature extraction'
      paths = []
      for mode in [utils.TRAIN, utils.EVAL, utils.INFER]:
        paths += config['data'][mode]['paths']
      task = task_class(config, utils.INFER)
      task.generate_feat(paths, dry_run=FLAGS.dry_run)
    elif FLAGS.cmd == 'gen_cmvn':
      logging.info(
          'using the infer pipeline to compute cmvn of train paths; stride must be 1'
      )
      paths = config['data'][utils.TRAIN]['paths']
      segments = config['data'][utils.TRAIN]['segments']
      config['data'][utils.INFER]['paths'] = paths
      config['data'][utils.INFER]['segments'] = segments
      task = task_class(config, utils.INFER)
      task.generate_cmvn(dry_run=FLAGS.dry_run)
  elif FLAGS.cmd == 'build':
    build_dataset(FLAGS.name, FLAGS.dir)
  else:
    raise ValueError("Unsupported command: {}.".format(FLAGS.cmd))
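Example #2's main() reads several absl flags (config, cmd, dry_run, ...). A hedged sketch of the flag scaffolding such an entry point relies on; the flag names are taken from the code above, but the defaults and help strings are assumptions:

from absl import app, flags

FLAGS = flags.FLAGS
# Hypothetical definitions inferred from the FLAGS.* usages in main().
flags.DEFINE_string('config', '', 'path to the YAML config file')
flags.DEFINE_string('cmd', 'train', 'command to run: train, eval, infer, ...')
flags.DEFINE_bool('log_debug', False, 'enable debug logging')
flags.DEFINE_bool('only_nlp', False, 'register only NLP modules')
flags.DEFINE_bool('dry_run', False, 'do not write generated features')
flags.DEFINE_string('name', '', 'dataset name for the build command')
flags.DEFINE_string('dir', '', 'output directory for the build command')

if __name__ == '__main__':
  app.run(main)  # app.run parses the flags and then calls main(argv)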
Example #3
  def test_english(self):
    """ test text match task of english data """
    config = utils.load_config(self.config_file)
    max_seq_len = config["data"]["task"]["max_seq_len"]
    class_num = config["data"]["task"]["classes"]["num_classes"]
    batch_size = config["data"]["task"]["batch_size"]
    data_config = config["data"]
    task_config = data_config["task"]
    task_config["language"] = "english"
    task_config["split_by_space"] = False
    task_config["use_word"] = True
    task_config[
        "text_vocab"] = "egs/mock_text_match_data/text_match/v1/data/text_vocab.txt"
    task_config["need_shuffle"] = False

    # generate_mock_files(config)

    task = TextMatchTask(config, utils.TRAIN)

    # test offline data
    data = task.dataset()
    self.assertTrue("input_x_dict" in data and
                    "input_x_left" in data["input_x_dict"] and
                    "input_x_right" in data["input_x_dict"])
    self.assertTrue("input_y_dict" in data and
                    "input_y" in data["input_y_dict"])
    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
      sess.run([data["iterator"].initializer, data["iterator_len"].initializer])
      res = sess.run([
          data["input_x_dict"]["input_x_left"],
          data["input_x_dict"]["input_x_right"],
          data["input_y_dict"]["input_y"],
          data["input_x_len"]["input_x_left_len"],
          data["input_x_len"]["input_x_right_len"],
      ])
      logging.debug(res[0][0][:10])
      logging.debug(res[1][0])
      logging.debug(res[2][0])
      logging.debug(res[3])
      logging.debug(res[4])

      self.assertAllEqual(res[0][0][:10], [2, 3, 4, 5, 6, 0, 0, 0, 0, 0])
      self.assertEqual(np.shape(res[0]), (batch_size, max_seq_len))
      self.assertEqual(np.shape(res[1]), (batch_size, max_seq_len))
      self.assertEqual(np.shape(res[2]), (batch_size, class_num))
      self.assertEqual(np.shape(res[3]), (batch_size,))
      self.assertEqual(np.shape(res[4]), (batch_size,))
    # test online data
    export_inputs = task.export_inputs()
    self.assertTrue("export_inputs" in export_inputs and
                    "input_sent_left" in export_inputs["export_inputs"] and
                    "input_sent_right" in export_inputs["export_inputs"])

    input_sent_left = export_inputs["export_inputs"]["input_sent_left"]
    input_sent_right = export_inputs["export_inputs"]["input_sent_right"]
    input_x_left = export_inputs["model_inputs"]["input_x_left"]
    input_x_right = export_inputs["model_inputs"]["input_x_right"]
    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
      # sess.run(data["iterator"].initializer)
      sess.run(data["iterator"].initializer)
      res1, res2 = sess.run(
          [input_x_left, input_x_right],
          feed_dict={
              input_sent_left: ["How should I approach forgiveness?"],
              input_sent_right: ["I got chickenpox as a child."]
          })
      logging.debug(res1[0][:10])
      logging.debug(res2[0][:10])
      self.assertAllEqual(res1[0][:10], [2, 3, 4, 5, 6, 0, 0, 0, 0, 0])
      self.assertAllEqual(res2[0][:10], [4, 7, 8, 9, 10, 11, 0, 0, 0, 0])
      self.assertEqual(np.shape(res1[0]), (max_seq_len,))
      self.assertEqual(np.shape(res2[0]), (max_seq_len,))
Example #4
  def setUp(self):
    super().setUp()
    self.conf_str = '''
    data:
      train:
        paths: null
        segments: null
      eval:
        paths: null
        segments: null
      infer:
        paths: null
        segments: null
      task:
        dummy: true # dummy inputs 
        name: AsrSeqTask
        type: asr # asr, tts
        audio:
          dry_run: false # not save feat
        src:
          max_len: 3000 # max length for frames
          subsampling_factor: 1
          preprocess_conf: null
        tgt:
          max_len: 100 # max length for target tokens
        vocab:
          type: char # char, bpe, wpm, word
          size: 3653 # vocab size in vocab_file
          path: '/nfs/cold_project/dataset/opensource/librispeech/espnet/egs/hkust/asr1/data/lang_1char/train_nodup_sp_units.txt' # path to vocab file
        batch:
          batch_size: 32 # number of elements in a training batch
          batch_bins: 0 # maximum number of bins (frames x dim) in a training batch
          batch_frames_in: 0 # maximum number of input frames in a training batch
          batch_frames_out: 0 # maximum number of output frames in a training batch
          batch_frames_inout: 0 # maximum number of input+output frames in a training batch
          batch_strategy: auto # strategy for computing the maximum batch size (supports 4 values: "auto", "seq", "frame", "bin")
        batch_mode: false # true, user controls batching; false, `generate` yields one example
        num_parallel_calls: 12
        num_prefetch_batch: 2
        shuffle_buffer_size: 200000
        need_shuffle: true
        sortagrad: true
        batch_sort_key: 'input' # shuffle, input, output for asr and tts, and sortagrad for asr
        num_batches: 0 # for debugging

    model:
      name: CTCAsrModel
      type: keras # raw, keras or eager model
      net:
        structure:
          encoder:
            name:
            filters: # equal number of cnn layers
            - 128
            - 512
            - 512
            filter_size: # equal number of cnn layers
            - [5, 3]
            - [5, 3]
            - [5, 3]
            filter_stride: # equal number of cnn layers
            - [1, 1]
            - [1, 1]
            - [1, 1]
            pool_size: # equal number of cnn layers
            - [4, 4]
            - [1, 2]
            - [1, 2]
            num_filters: 128
            linear_num: 786 # number of hidden units of the linear layer
            cell_num: 128 # cell units of the lstm
            hidden1: 64 # number of hidden units of the fully connected layer
            attention: false # whether to use attention; false means use max-pooling
            attention_size: 128 # attention size
            use_lstm_layer: false # whether to use an lstm layer; false means no lstm layer
            use_dropout: true # whether to use a dropout layer
            dropout_rate: 0.2
            use_bn: true # whether to use a bn layer
          decoder:
            name: 
          attention:
            name:
    solver:
      name: AsrSolver
      quantization:
        enable: false # whether to quantize the model
        quant_delay: 0 # Number of steps after which weights and activations are quantized during training
      adversarial:
        enable: false # whether to use adversarial training
        adv_alpha: 0.5 # adversarial alpha of loss
        adv_epslion: 0.1 # adversarial example epsilon
      model_average:
        enable: false # use average model
        var_avg_decay: 0.99 # the decay rate of variables
      distilling:
        enable: false 
        name : Teacher
        loss : DistillationLoss
        temperature: 5
        alpha: 0.5
        teacher_model: null # frozen_graph.pb
      optimizer:
        name: adam
        epochs: 5 # maximum epochs
        loss: CTCLoss 
        label_smoothing: 0.0 # label smoothing rate
        learning_rate:
          rate: 0.0001 # learning rate of Adam optimizer
          type:  exp_decay # learning rate type
          decay_rate: 0.99  # the lr decay rate
          decay_steps: 100  # the lr decay_step for optimizer
        clip_global_norm: 3.0 # clip global norm
        multitask: False # whether this is a multi-task model
        early_stopping: # keras early stopping
          enable: true
          monitor: val_loss
          min_delta: 0
          patience: 5
      metrics:
        pos_label: 1 # int, same as in sklearn
        cals:
        - name: AccuracyCal
          arguments: null 
        - name: ConfusionMatrixCal
          arguments: null
        - name: PrecisionCal
          arguments:
            average: 'binary'
        - name: RecallCal
          arguments:
            average: 'binary'
        - name: F1ScoreCal
          arguments:
            average: 'binary'
      postproc:
          enable: false
          name: EmoPostProc
          log_verbose: false 
          eval: true # compute metrics
          infer: true  # get predict results
          pred_path: null # None for `model_path`/infer, dumps infer output to this dir
          thresholds:
              - 0.5
          smoothing:
              enable: true
              count: 2
      saver:
        model_path: "ckpt/asr-seq/test"
        max_to_keep: 10
        save_checkpoints_steps: 100
        keep_checkpoint_every_n_hours: 10000
        checkpoint_every: 100 # the step to save checkpoint
        summary: false
        save_summary_steps: 100
        eval_on_dev_every_secs: 1
        print_every: 10
        resume_model_path: ""
      run_config:
        debug: false # use tfdbug
        tf_random_seed: null # 0-2**32; null is None, try to read data from /dev/urandom if available or seed from the clock otherwise
        allow_soft_placement: true
        log_device_placement: false
        intra_op_parallelism_threads: 10
        inter_op_parallelism_threads: 10
        allow_growth: true
        log_step_count_steps: 100 # The frequency, in number of global steps, that the global step/sec and the loss will be logged during training.
      run_options:
        trace_level: 3 # 0: no trace, 1: software trace, 2: hardware trace, 3: full trace
        inter_op_thread_pool: -1
        report_tensor_allocations_upon_oom: true
    
    serving:
      enable: false 
      name : Evaluate
      model: null # saved model dir, ckpt dir, or frozen_model.pb
      inputs: 'inputs:0'
      outputs: 'softmax_output:0'
    '''
    import_all_modules_for_register()
    tempdir = self.get_temp_dir()

    config_path = str(Path(tempdir).joinpath("asr_seq.yaml"))
    logging.info("config path: {}".format(config_path))
    with open(config_path, 'w', encoding='utf-8') as f:  #pylint: disable=invalid-name
      f.write(self.conf_str)

    self.config = utils.load_config(config_path)
    self.mode = utils.TRAIN
    self.batch_size = 4
    self.config['solver']['optimizer']['batch_size'] = self.batch_size

    # generate dummy data
    nexamples = 10
    generate_json_data(self.config, self.mode, nexamples)
Example #5
    def test_chinese_split_by_space(self):
        """ test match task of chiniese data, split sentences by space"""

        config = utils.load_config(self.config_file)
        max_seq_len = config["data"]["task"]["max_seq_len"]
        class_num = config["data"]["task"]["classes"]["num_classes"]
        data_config = config["data"]
        task_config = data_config["task"]
        task_config["language"] = "chinese"
        task_config["split_by_space"] = False
        task_config["use_word"] = False

        # generate_mock_files(config)
        task = TextMatchTask(config, utils.TRAIN)

        # test offline data
        data = task.dataset()
        self.assertTrue("input_x_dict" in data
                        and "input_x_left" in data["input_x_dict"]
                        and "input_x_right" in data["input_x_dict"])
        self.assertTrue("input_y_dict" in data
                        and "input_y" in data["input_y_dict"])
        with self.session() as sess:
            sess.run([
                data["iterator"].initializer, data["iterator_len"].initializer
            ])
            res = sess.run([
                data["input_x_dict"]["input_x_left"],
                data["input_x_dict"]["input_x_right"],
                data["input_y_dict"]["input_y"],
                data["input_x_len"]["input_x_left_len"],
                data["input_x_len"]["input_x_right_len"],
            ])

            logging.debug(res[0][0])
            logging.debug(res[1][0])
            logging.debug(res[2][0])
            logging.debug(res[3])
            logging.debug(res[4])

            self.assertEqual(np.shape(res[0]), (16, max_seq_len))
            self.assertEqual(np.shape(res[1]), (16, max_seq_len))
            self.assertEqual(np.shape(res[2]), (16, class_num))
            self.assertEqual(np.shape(res[3]), (16, ))
            self.assertEqual(np.shape(res[4]), (16, ))

        # test online data

        export_inputs = task.export_inputs()
        self.assertTrue(
            "export_inputs" in export_inputs
            and "input_sent_left" in export_inputs["export_inputs"]
            and "input_sent_right" in export_inputs["export_inputs"])

        input_sent_left = export_inputs["export_inputs"]["input_sent_left"]
        input_sent_right = export_inputs["export_inputs"]["input_sent_right"]
        input_x_left = export_inputs["model_inputs"]["input_x_left"]
        input_x_right = export_inputs["model_inputs"]["input_x_right"]
        with self.session() as sess:
            sess.run(data["iterator"].initializer)
            res1, res2 = sess.run([input_x_left, input_x_right],
                                  feed_dict={
                                      input_sent_left: ["我很可爱。"],
                                      input_sent_right: ["中国很可爱。"]
                                  })
            logging.debug(res1[0])
            logging.debug(res2[0])
            self.assertEqual(np.shape(res1[0]), (max_seq_len, ))
            self.assertEqual(np.shape(res2[0]), (max_seq_len, ))
Example #6
  def test_run():
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    logging.set_verbosity(logging.DEBUG)

    main_root = os.environ['MAIN_ROOT']
    main_root = Path(main_root)
    config_file = main_root.joinpath('speaker-test.yml')
    config = utils.load_config(config_file)

    solver_name = config['solver']['name']
    solver = registers.solver[solver_name](config)

    # config after process
    config = solver.config

    test_samples = False
    if test_samples:
      config['data']['task']['suffix'] = '.wav'
      data = SpeakerClsTask(config, utils.TRAIN)
    else:
      config['data']['task']['suffix'] = '.npy'
      data = SpeakerClsTask(config, utils.TRAIN)

    num_samples = 0
    log_samples = False
    for inputs, texts, label, filename, clip_id, soft_labels in \
        data.generate_data():
      if log_samples:
        logging.info(
            "feat shape:{} \ntext: {} \nlabels:{} \nfilename:{} \nclip_id:{}\nsoft_labels:{}"
            .format(inputs.shape, texts, label, filename, clip_id, soft_labels))
      if num_samples % 100 == 0:
        logging.info('Processed %d samples.' % (num_samples))
      num_samples += 1

    logging.info('Processed %d samples.' % (num_samples))

    batch_size = 4
    config['solver']['optimizer']['batch_size'] = batch_size
    dataset = data.input_fn(utils.TRAIN, batch_size, 1)()
    features, labels = dataset.make_one_shot_iterator().get_next()
    samples = features['inputs']
    filenames = features['filepath']
    clip_ids = features['clipid']
    soft_labels = features['soft_labels']

    num_samples = 0
    log_samples = False
    with tf.Session() as sess:
      try:
        while True:
          batch_inputs, batch_labels, batch_files, batch_clipids, \
            labels_onehot, batch_soft_labels = \
            sess.run([samples, labels, filenames, clip_ids, tf.one_hot(labels, 2), soft_labels])
          if log_samples:
            logging.debug("feat shape: {}".format(batch_inputs.shape))
            logging.debug("labels: {}".format(batch_labels))
            logging.debug("filename: {}".format(batch_files))
            logging.debug("clip id: {}".format(batch_clipids))
            logging.debug("onehot: {}".format(labels_onehot))
            logging.debug("soft_labels: {}".format(batch_soft_labels))
          if num_samples % 100 == 0:
            logging.info('Processed %d samples.' % (num_samples))
          num_samples += 1
      except tf.errors.OutOfRangeError as ex:
        logging.info(ex)
    logging.info('Processed %d samples.' % (num_samples))
Example #7
    def setUp(self):
        super().setUp()
        import_all_modules_for_register()
        self.conf_str = '''
    data:
      train:
        paths:
        - ''
      eval:
        paths:
        - ''
      infer:
        paths:
        - ''
      task:
        name: SpeakerClsTask
        data_type: KaldiDataDirectory
        suffix: .npy # file suffix
        audio:
          dry_run: false # not save feat
          # params
          clip_size: 3 # clip len in seconds
          stride: 0.5 # stride in ratio of clip_size
          sr: 8000 # sample rate
          winlen: 0.025 # window len
          winstep: 0.01 # window stride
          nfft: 512 # fft bins, default: 512
          lowfreq: 0
          highfreq: null # default: null, 200 points for 800 nfft, 400 points for 1600 nfft
          preemph: 0.97 # default: 0.97
          # extractor
          feature_extractor: tffeat # 'tffeat' to use the TF feature_extraction .so library, 'pyfeat' to use python_speech_features
          save_feat_path: null  # null to dump feats into the same dir as the wavs
          # fbank
          save_fbank: true # save fbank or power spec
          feature_size: 23 # extract feature size
          add_delta_deltas: false # delta deltas
          # log power
          log_powspec: false # true, save log power spec; otherwise save power spec
          # cmvn
          cmvn: true # apply cmvn or generate cmvn
          cmvn_path: ./cmvn_speaker.npy # cmvn file
        classes:
          num: 2 
          vocab: null
        num_parallel_calls: 12
        num_prefetch_batch: 2
        shuffle_buffer_size: 200000
        need_shuffle: true

    model:
      name: SpeakerCRNNRawModel
      type: raw # raw, keras or eager model
      net:
        structure:
          embedding_size: 2
          filters: # equal number of cnn layers
          - 2
          filter_size: # equal number of cnn layers
          - [1, 1]
          filter_stride: # equal number of cnn layers
          - [1, 1]
          pool_size: # equal number of cnn layers
          - [8, 8]
          tdnn_contexts:
          - 3
          - 3
          tdnn_dims:
          - 128
          - 128
          num_filters: 2
          linear_num: 2 # number of hidden units of the linear layer
          cell_num: 2 # cell units of the lstm
          hidden1: 2 # number of hidden units of the fully connected layer
          attention: false # whether to use attention; false means use max-pooling
          attention_size: 64 # attention size
          use_lstm_layer: false # whether to use an lstm layer; false means no lstm layer
          use_dropout: true # whether to use a dropout layer
          dropout_rate: 0.2
          use_bn: true # whether to use a bn layer

          score_threshold: 0.5 # threshold to predict POS example
          threshold: 3 # threshold to predict POS example

    solver:
      name: SpeakerSolver
      adversarial:
        enable: false # whether to use adversarial training
        adv_alpha: 0.5 # adversarial alpha of loss
        adv_epslion: 0.1 # adversarial example epsilon
      model_average:
        enable: false # use average model
        var_avg_decay: 0.99 # the decay rate of variables
      optimizer:
        name: adam
        epochs: 5 # maximum epochs
        batch_size: 4 # number of elements in a training batch
        loss: CrossEntropyLoss
        label_smoothing: 0.0 # label smoothing rate
        learning_rate:
          rate: 0.0001 # learning rate of Adam optimizer
          type:  exp_decay # learning rate type
          decay_rate: 0.99  # the lr decay rate
          decay_steps: 100  # the lr decay_step for optimizer
        clip_global_norm: 3.0 # clip global norm
      metrics:
        pos_label: 1 # int, same as in sklearn
        cals:
        - name: AccuracyCal
          arguments: null
        - name: ConfusionMatrixCal
          arguments: null
        - name: PrecisionCal
          arguments:
            average: 'binary'
        - name: RecallCal
          arguments:
            average: 'binary'
        - name: F1ScoreCal
          arguments:
            average: 'binary'
      postproc:
          name: SpeakerPostProc
          log_verbose: false
          eval: true # compute metrics
          infer: true  # get predict results
          pred_path: null # None for `model_path`/infer, dumps infer output to this dir
          thresholds:
              - 0.5
          smoothing:
              enable: true
              count: 2
      saver:
        model_path: "ckpt/emotion-speech-cls/test"
        max_to_keep: 10
        save_checkpoints_steps: 10
        keep_checkpoint_every_n_hours: 10000
        checkpoint_every: 10 # the step to save checkpoint
        summary: false
        save_summary_steps: 5
        eval_on_dev_every_secs: 1
        print_every: 10
        resume_model_path: ""
      run_config:
        debug: false # use tfdbug
        tf_random_seed: null # 0-2**32; null is None, try to read data from /dev/urandom if available or seed from the clock otherwise
        allow_soft_placement: true
        log_device_placement: false
        intra_op_parallelism_threads: 1
        inter_op_parallelism_threads: 1
        allow_growth: true
        log_step_count_steps: 1 # The frequency, in number of global steps, that the global step/sec and the loss will be logged during training.
      distilling:
        enable: false
        name : Teacher
        loss : DistillationLoss
        temperature: 5
        alpha: 0.5
        teacher_model: ''

    serving:
      enable: true
      name : Evaluate
      model: '' # saved model dir, ckpt dir, or frozen_model.pb
      inputs: 'inputs:0'
      outputs: 'softmax_output:0'
    '''

        # write config to file
        tempdir = self.get_temp_dir()
        #tempdir = 'bar'
        os.makedirs(tempdir, exist_ok=True)

        config_path = str(Path(tempdir).joinpath('speaker_task.yaml'))
        logging.info("config path: {}".format(config_path))
        with open(config_path, 'w', encoding='utf-8') as f:  #pylint: disable=invalid-name
            f.write(self.conf_str)

        # load config
        config = utils.load_config(config_path)
        logging.info("config: {}".format(config))

        # edit path in config
        dataset_path = Path(tempdir).joinpath('data')
        if not dataset_path.exists():
            dataset_path.mkdir()
        dataset_path_str = str(dataset_path)
        config['data']['train']['paths'] = [dataset_path_str]
        config['data']['eval']['paths'] = [dataset_path_str]
        config['data']['infer']['paths'] = [dataset_path_str]

        # generate dummy data
        feat_dim = config['data']['task']['audio']['feature_size']
        kaldi_dir_utils.gen_dummy_data_dir(dataset_path_str,
                                           2,
                                           2,
                                           feat_dim=feat_dim)

        solver_name = config['solver']['name']
        self.solver = registers.solver[solver_name](config)

        # config after process
        self.config = self.solver.config
Example #8
 def setUp(self):
   main_root = os.environ['MAIN_ROOT']
   main_root = Path(main_root)
   self.config_file = main_root.joinpath('egs/mock_text_seq_label_data/config/seq-label-mock.yml')
   self.config = utils.load_config(self.config_file)
   import_all_modules_for_register()
Example #9
  def setUp(self):
    ''' set up'''
    self.conf_str = '''
    data:
      train:
        paths:
        - null 
        segments: null
      eval:
        paths:
        - null
        segments: null
      infer:
        paths:
        - null 
        segments: null
      task:
        name: SpeechClsTask
        suffix: .npy # file suffix
        audio:
          dry_run: false # not save feat
          # params
          clip_size: 30 # clip len in seconds
          stride: 0.5 # stride in ratio of clip_size
          sr: 8000 # sample rate
          winlen: 0.025 # window len
          winstep: 0.01 # window stride
          nfft: 512 # fft bins, default: 512
          lowfreq: 0
          highfreq: null # default: null, 200 points for 800 nfft, 400 points for 1600 nfft
          preemph: 0.97 # default: 0.97
          # extractor
          feature_extractor: tffeat # 'tffeat' to use the TF feature_extraction .so library, 'pyfeat' to use python_speech_features
          feature_name: fbank # fbank or spec
          save_feat_path: null  # null to dump feats into the same dir as the wavs
          feature_size: 40 # extract feature size
          add_delta_deltas: true # delta deltas
          # log power
          log_powspec: false # true, save log power spec; otherwise save power spec
          # cmvn
          cmvn: true # apply cmvn or generate cmvn
          cmvn_path: ./cmvn_conflict.npy # cmvn file
        text:
          enable: False
          vocab_path: /vocab/chars5004_attention.txt
          vocab_size: 5004 # vocab size
          max_text_len: 100 # max length for text
        classes:
          num: 2
          vocab:
            normal: 0
            conflict: 1
        num_parallel_calls: 12
        num_prefetch_batch: 2
        shuffle_buffer_size: 200000
        need_shuffle: true
    solver:
      name: EmotionSolver
      optimizer:
        name: adam
        epochs: 5 # maximum epochs
        batch_size: 32 # number of elements in a training batch
        loss: CrossEntropyLoss
        label_smoothing: 0.0 # label smoothing rate
        learning_rate:
          rate: 0.0001 # learning rate of Adam optimizer
          type:  exp_decay # learning rate type
          decay_rate: 0.99  # the lr decay rate
          decay_steps: 100  # the lr decay_step for optimizer
        clip_global_norm: 3.0 # clip global norm
        multitask: False # whether this is a multi-task model
      metrics:
        pos_label: 1 # int, same as in sklearn
        cals:
        - name: AccuracyCal
          arguments: null 
        - name: ConfusionMatrixCal
          arguments: null
        - name: PrecisionCal
          arguments:
            average: 'binary'
        - name: RecallCal
          arguments:
            average: 'binary'
        - name: F1ScoreCal
          arguments:
            average: 'binary'
      saver:
        model_path: "ckpt/emotion-speech-cls/test"
        max_to_keep: 10
        save_checkpoints_steps: 100
        keep_checkpoint_every_n_hours: 10000
        checkpoint_every: 100 # the step to save checkpoint
        summary: false
        save_summary_steps: 100
        eval_on_dev_every_secs: 1
        print_every: 10
        resume_model_path: ""
    '''
    import_all_modules_for_register()
    #tempdir = tempfile.mkdtemp()
    tempdir = self.get_temp_dir()

    config_path = str(Path(tempdir).joinpath("speech_task.yaml"))
    logging.info("config path: {}".format(config_path))
    with open(config_path, 'w', encoding='utf-8') as f:  #pylint: disable=invalid-name
      f.write(self.conf_str)

    dataset_path = Path(tempdir).joinpath("data")
    if not dataset_path.exists():
      dataset_path.mkdir()
    positive_path = dataset_path.joinpath("conflict")
    if not positive_path.exists():
      positive_path.mkdir()
    negative_path = dataset_path.joinpath("normal")
    if not negative_path.exists():
      negative_path.mkdir()

    wav_path = Path(os.environ['MAIN_ROOT']).joinpath(
        'delta/data/feat/python_speech_features/english.wav')
    for i in range(10):
      pos_file = positive_path.joinpath("{}.wav".format(i))
      neg_file = negative_path.joinpath("{}.wav".format(i))
      shutil.copyfile(str(wav_path), str(pos_file))
      shutil.copyfile(str(wav_path), str(neg_file))

    config = utils.load_config(config_path)
    config['data']['train']['paths'] = [str(dataset_path)]
    config['data']['eval']['paths'] = [str(dataset_path)]
    config['data']['infer']['paths'] = [str(dataset_path)]
    logging.info("config: {}".format(config))

    solver_name = config['solver']['name']
    self.solver = registers.solver[solver_name](config)

    # config after process
    self.config = self.solver.config

    task_name = self.config['data']['task']['name']
    self.task_class = registers.task[task_name]
Example #10
  def test_english(self):
    """ test seq to seq task of chiniese data, split sentences by space"""

    config = utils.load_config(self.config_file)
    max_len = config["model"]["net"]["structure"]["max_enc_len"]
    data_config = config["data"]
    task_config = data_config["task"]
    task_config["language"] = "english"
    task_config["split_by_space"] = False
    task_config["use_word"] = True

    # test offline data for 'train'

    task = TextS2STask(config, utils.TRAIN)
    data = task.dataset()
    self.assertTrue("input_x_dict" in data and
                    "input_enc_x" in data["input_x_dict"] and
                    "input_dec_x" in data["input_x_dict"])
    self.assertTrue("input_y_dict" in data and
                    "input_y" in data["input_y_dict"])
    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
      sess.run(data["iterator"].initializer)
      res = sess.run([
          data["input_x_dict"]["input_enc_x"],
          data["input_x_dict"]["input_dec_x"], data["input_y_dict"]["input_y"],
          data["input_x_len"]
      ])

      logging.debug(res[0][0])
      logging.debug(res[1][0])
      logging.debug(res[2][0])
      logging.debug(res[3])

      self.assertEqual(np.shape(res[0])[0], 16)
      self.assertEqual(np.shape(res[1])[0], 16)
      self.assertEqual(np.shape(res[2])[0], 16)
      self.assertEqual(np.shape(res[3])[0], 16)

    # test offline data for 'infer'
    task = TextS2STask(config, utils.INFER)
    task.infer_without_label = True
    data = task.dataset()
    self.assertTrue("input_x_dict" in data and
                    "input_enc_x" in data["input_x_dict"])
    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
      sess.run(data["iterator"].initializer)
      res = sess.run([data["input_x_dict"]["input_enc_x"], data["input_x_len"]])

      logging.debug(res[0][0])
      logging.debug(res[1][0])

      self.assertEqual(np.shape(res[0])[0], 16)
      self.assertEqual(np.shape(res[1])[0], 16)

    # test online data
    export_inputs = task.export_inputs()
    self.assertTrue("export_inputs" in export_inputs and
                    "input_sentence" in export_inputs["export_inputs"])
    input_sentence = export_inputs["export_inputs"]["input_sentence"]
    input_x = export_inputs["model_inputs"]["input_enc_x"]

    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
      sess.run(data["iterator"].initializer)
      res = sess.run(
          input_x,
          feed_dict={
              input_sentence: [
                  " vice president walter "
                  "mondale was released"
              ]
          })
      logging.debug(res[0][:5])
      logging.debug(np.shape(res[0]))
      self.assertEqual(np.shape(res[0]), (max_len,))
Example #11
 def test_valid_config(self):
     ''' valid config unittest '''
     utils.save_config(self.conf_true, self.conf_file)
     conf = utils.load_config(self.conf_file)
     self.assertTrue(utils.valid_config(conf))
Example #12
 def test_save_config(self):
     ''' save config unittest '''
     utils.save_config(self.conf_true, self.conf_file)
     conf = utils.load_config(self.conf_file)
     self.assertDictEqual(conf, self.conf_true)
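By symmetry with Examples #11 and #12, save_config presumably dumps a config dict back to YAML so that a load_config round trip reproduces it (that is what the assertDictEqual above verifies). A minimal sketch under that assumption:

import yaml

def save_config(config, config_file):
  # Hypothetical YAML-backed writer, mirroring the load_config sketch above;
  # the actual utils.save_config may differ.
  with open(config_file, 'w', encoding='utf-8') as f:
    yaml.safe_dump(config, f, default_flow_style=False)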