Example 1
 def __init__(self, istream_handle, squared=True, window_size=256):
     self.window_size = window_size
     self.shift_dist = window_size // 2    # hop size: 50% overlap between analysis windows
     self.batch_size = 100
     self.squared = squared
     buff_size = self.window_size + (self.batch_size - 1) * self.shift_dist
     super(spectrum, self).__init__(istream_handle, buff_size,
                                    self.window_size - self.shift_dist, (),
                                    istream_handle.stream.data_format['dtype'])

     # One spectrum of window_size/2 bins is emitted per hop of shift_dist samples.
     if squared:
         self.ostream = data_stream(istream_handle.stream.sample_rate / float(self.shift_dist),
                                    data_format={'shape': (self.window_size // 2,), 'dtype': np.double})
     else:
         self.ostream = data_stream(istream_handle.stream.sample_rate / float(self.shift_dist),
                                    data_format={'shape': (self.window_size // 2,), 'dtype': np.complex128})
     self.w = np.hamming(self.window_size)
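
A minimal, hypothetical sketch of the analysis these parameters describe (the class's processing method is not shown here; the helper name and exact FFT handling are assumptions):

import numpy as np

def frame_spectra(samples, window_size=256, squared=True):
    # Hamming-windowed frames advanced by window_size // 2 samples (50% overlap),
    # keeping the first window_size // 2 FFT bins, squared magnitude if requested.
    shift = window_size // 2
    w = np.hamming(window_size)
    n_frames = (len(samples) - window_size) // shift + 1
    out = []
    for i in range(n_frames):
        frame = samples[i * shift:i * shift + window_size] * w
        spec = np.fft.fft(frame)[:window_size // 2]
        out.append(np.abs(spec) ** 2 if squared else spec)
    return np.asarray(out)
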
Example 2
 def __init__(self, file_name):
     super(audio_decoder, self).__init__()
     self.file_name = file_name
     ext = file_name.strip().split('.')[-1].lower()
     dm = muxer.Demuxer(ext)
     fin = open(file_name, 'rb')
     if not fin:
         raise IOError('cannot find file %s' % file_name)
     s = fin.read(3000000)    # read a large initial chunk for the demuxer
     r = dm.parse(s)          # demux it into raw frames

     print dm.streams
     self.decoder = None
     # Pick the first audio stream in the file and build a decoder for it.
     for aindex in xrange(len(dm.streams)):
         if dm.streams[aindex] and dm.streams[aindex]['type'] == muxer.CODEC_TYPE_AUDIO:
             self.decoder = acodec.Decoder(dm.streams[aindex])
             self.aindex = aindex
             break
     if not self.decoder:
         raise ValueError('no audio track found in given media file!')

     # Resample whatever the file contains to mono at the target sample rate.
     self.resampler = sound.Resampler((dm.streams[aindex]['sample_rate'], dm.streams[aindex]['channels']),
                                      (constants.AUDIO_SAMPLE_RATE, 1))
     self.ostream = data_stream(constants.AUDIO_SAMPLE_RATE, data_format={'dtype': np.int16})
     self.odtype = np.int16
     self.demuxer = dm
     self.frames = r
     self.fin = fin
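
A hedged sketch of how the objects stored above are typically driven in pymedia-based code; the class's actual read method is not shown, so the function name and loop are illustrative only (they assume the attributes set in __init__):

import numpy as np

def decode_audio(dec):
    # 'dec' is an audio_decoder instance: walk the demuxed frames of the chosen
    # audio stream, decode and resample them, and yield int16 sample arrays.
    for frame in dec.frames:
        if frame[0] != dec.aindex:           # frames of other streams are skipped
            continue
        r = dec.decoder.decode(frame[1])     # pymedia acodec decode
        if r and r.data:
            raw = dec.resampler.resample(r.data)
            yield np.frombuffer(raw, dtype=dec.odtype)
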
Example 3
 def __init__(self, file_name, output_rate=constants.AUDIO_SAMPLE_RATE):
     super(ffmpeg_decoder, self).__init__()
     self.file_name = file_name
     self.ostream = data_stream(output_rate,
                                data_format={'dtype': np.int16})
     self.odtype = np.int16
     self.output_rate = output_rate
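
Since this __init__ only records configuration, here is a hedged sketch of how such a decoder commonly obtains its samples: piping ffmpeg's raw signed 16-bit mono output at the requested rate. This is an assumption about the implementation, not code from the class, and the helper name is made up:

import subprocess
import numpy as np

def read_pcm_via_ffmpeg(file_name, output_rate):
    # Ask ffmpeg for mono, 16-bit little-endian PCM at output_rate on stdout.
    cmd = ['ffmpeg', '-i', file_name, '-f', 's16le', '-acodec', 'pcm_s16le',
           '-ac', '1', '-ar', str(output_rate), '-']
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    raw, _ = proc.communicate()
    return np.frombuffer(raw, dtype=np.int16)
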
Example 4
 def __init__(self, istream_handle):
     self.window_size = 256
     self.shift_dist = 128
     self.batch_size = 100
     buff_size = self.window_size + (self.batch_size - 1) * self.shift_dist
     super(spectrum, self).__init__(istream_handle, buff_size,
                                    self.window_size - self.shift_dist, (),
                                    istream_handle.stream.data_format['dtype'])

     self.ostream = data_stream(constants.AUDIO_SAMPLE_RATE / float(self.shift_dist),
                                data_format={'shape': (self.window_size // 2,), 'dtype': np.double})
     self.w = scipy.hamming(self.window_size)
Example 5
    def __init__(self, istream_handle):
        self.input_format = istream_handle.stream.data_format
        self.input_size = self.input_format['shape'][0]
        super(naive_vad_score, self).__init__(istream_handle, 100, 0,
                                              self.input_format['shape'],
                                              self.input_format['dtype'])

        self.ostream = data_stream(
            istream_handle.stream.sample_rate,
            data_format={'dtype': self.input_format['dtype']})
Example 6
def main(unused_argv):
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        level=logging.DEBUG if FLAGS.debug else logging.INFO)

    (train_xs, train_ys), (test_xs, test_ys), (dev_xs, dev_ys) \
        = load_bibtex('bibtex-train.arff', 'bibtex-test.arff')

    _, init_params = init_mlp(npr.PRNGKey(FLAGS.random_seed),
                              (FLAGS.batch_size, INPUTS))

    opt_init, opt_update, get_params = adam(0.001)
    # opt_init, opt_update, get_params = momentum(0.001, 0.9)
    # opt_init, opt_update, get_params = sgd(0.001)
    opt_state = opt_init(init_params)

    @jit
    def update(i, opt_state, batch):
        params = get_params(opt_state)
        loss, g = value_and_grad(cross_entropy_loss)(params, *batch)
        return opt_update(i, g, opt_state), loss

    num_batches = int(onp.ceil(len(train_xs) / FLAGS.batch_size))
    train_stream = data_stream(train_xs,
                               train_ys,
                               batch_size=FLAGS.batch_size,
                               random_seed=FLAGS.random_seed,
                               infty=True)
    itercount = itertools.count()
    best_f1 = 0.
    for epoch in range(FLAGS.epochs):
        step_loss = 0.
        for _ in tqdm(range(num_batches)):
            opt_state, loss = update(next(itercount), opt_state,
                                     next(train_stream))
            step_loss += loss
        logger.info(f'epoch: {epoch} loss = {step_loss / num_batches}')
        f1 = evaluate(get_params(opt_state),
                      inference,
                      test_xs,
                      test_ys,
                      batch_size=FLAGS.batch_size,
                      threshold=0.5)
        if f1 > best_f1:
            best_f1 = f1
    logger.info(f'best F1 score = {best_f1}')
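
The loop relies on a data_stream batching helper; a minimal sketch of what it is assumed to do (the real implementation may differ): reshuffle once per pass over the data and yield (x, y) mini-batches, indefinitely when infty=True.

import numpy as onp

def data_stream(xs, ys, batch_size, random_seed=0, infty=False):
    rng = onp.random.RandomState(random_seed)
    num_batches = int(onp.ceil(len(xs) / batch_size))
    while True:
        perm = rng.permutation(len(xs))
        for i in range(num_batches):
            idx = perm[i * batch_size:(i + 1) * batch_size]
            yield xs[idx], ys[idx]
        if not infty:
            break
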
Example 7
 def __init__(self, istream_handle):
     super(naive_vad_decision, self).__init__(istream_handle, 500, 0, (129,),
                                              istream_handle.stream.data_format['dtype'])
     self.ostream = data_stream(float('inf'))
     self.Ts = None
     self.init_len = min(10, self.buff_size)    # at most the first 10 inputs are used for initialisation
     self.mu = 0
     self.sigma = 0
     self.H = None
     self.alpha = 0.5
     self.beta = 0.99
     self.speech = False
     self.max_sep = int(0.2 * istream_handle.stream.sample_rate)    # 0.2 s at the input stream's rate
     self.sep = 0
     self.min_seg = int(0.08 * istream_handle.stream.sample_rate)   # 0.08 s at the input stream's rate
     self.start_point = None

     # Detected segments are written out as SRT subtitles.
     self.fout = open('demo2.srt', 'w')
     self.count = 0
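
Each detected segment presumably ends up as one SRT block in demo2.srt; a hedged sketch of that formatting (the helper name and the text field are assumptions, only the SRT layout is standard):

def write_srt_entry(fout, count, start_sec, end_sec, text='speech'):
    # SRT block: running index, "HH:MM:SS,mmm --> HH:MM:SS,mmm", text, blank line.
    def fmt(t):
        ms_total = int(round(t * 1000))
        h, rem = divmod(ms_total, 3600000)
        m, rem = divmod(rem, 60000)
        s, ms = divmod(rem, 1000)
        return '%02d:%02d:%02d,%03d' % (h, m, s, ms)
    fout.write('%d\n%s --> %s\n%s\n\n' % (count, fmt(start_sec), fmt(end_sec), text))
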
Example 8
    def __init__(self, istream_handle):
        self.Fs = istream_handle.stream.sample_rate    # input sample rate in Hz
        self.segment_len = 2000      # analysis segment length, in ms
        self.segment_reserve = 40
        self.low_freq = 20
        self.high_freq = 2
        self.frame_len = 10          # frame length, in ms
        self.frame_shift = 10        # frame shift, in ms
        self.min_segment_len = 75
        self.breath_len = 20
        self.max_word_pause = 400
        self.max_clust_try = 100
        self.min_syllable_len = 100

        self.b = 3.5
        self.lambda_P = 1.0
        self.kmeans_error_threshold = 0.3
        self.preset_SNR = 10
        # Derived sizes: frames per 2 s segment and samples per 10 ms frame.
        self.iframe_per_segment = (self.segment_len -
                                   self.frame_len) / self.frame_shift + 1
        self.isample_per_frame = self.frame_len * self.Fs / 1000

        self.MCR_low = 0.03
        self.MCR_high = 0.4
        self.bcheck_prosody = False

        self.last_fragment = []
        self.bcircle_done = False
        self.istate = 1
        self.A = [-2147483648 for i in xrange(4 + 1)]    # five slots initialised to INT32_MIN
        self.peak_E = -1.0

        self.segment_E = np.zeros(self.iframe_per_segment, dtype=np.float)
        self.bin_speech = False
        self.icur_frame = 0

        self.start_end = []

        super(naive_vad,
              self).__init__(istream_handle,
                             self.isample_per_frame * self.iframe_per_segment,
                             0, (), istream_handle.stream.data_format['dtype'])
        self.ostream = data_stream(0)
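
The buffer size requested from the base class follows from the parameters above; a worked example, assuming a hypothetical 16 kHz input stream (the rate is not fixed by the snippet):

Fs = 16000
isample_per_frame = 10 * Fs / 1000                     # = 160 samples per 10 ms frame
iframe_per_segment = (2000 - 10) / 10 + 1              # = 200 frames per 2 s segment
buff_size = isample_per_frame * iframe_per_segment     # = 32000 samples passed to __init__
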
Example 9
    def __init__(self, istream_handle):
        super(naive_vad_decision,
              self).__init__(istream_handle, 500, 0, (129, ),
                             istream_handle.stream.data_format['dtype'])
        self.ostream = data_stream(float('inf'))
        self.Ts = None
        self.init_len = min(10, self.buff_size)
        self.mu = 0
        self.sigma = 0
        self.H = None
        self.alpha = 0.5
        self.beta = 0.99
        self.speech = False
        self.max_sep = int(0.2 * istream_handle.stream.sample_rate)
        self.sep = 0
        self.min_seg = int(0.08 * istream_handle.stream.sample_rate)
        self.start_point = None

        self.fout = open('demo2.srt', 'w')
        self.count = 0
Example 10
 def __init__(self, istream_handle):
     self.Fs = istream_handle.stream.sample_rate
     self.segment_len = 2000
     self.segment_reserve = 40
     self.low_freq = 20
     self.high_freq = 2
     self.frame_len = 10
     self.frame_shift = 10
     self.min_segment_len = 75
     self.breath_len = 20
     self.max_word_pause = 400
     self.max_clust_try = 100
     self.min_syllable_len = 100
     
     self.b = 3.5
     self.lambda_P = 1.0
     self.kmeans_error_threshold = 0.3
     self.preset_SNR = 10
     self.iframe_per_segment = (self.segment_len - self.frame_len)/self.frame_shift + 1
     self.isample_per_frame = self.frame_len * self.Fs / 1000
     
     self.MCR_low = 0.03
     self.MCR_high = 0.4
     self.bcheck_prosody = False
     
     self.last_fragment = []
     self.bcircle_done = False
     self.istate = 1
     self.A = [ -2147483648 for i in xrange(4+1)]
     self.peak_E = -1.0
     
     self.segment_E = np.zeros(self.iframe_per_segment, dtype = np.float)
     self.bin_speech = False
     self.icur_frame = 0
     
     self.start_end = []
     
     super(naive_vad, self).__init__(istream_handle,
                                     self.isample_per_frame * self.iframe_per_segment,
                                     0, (), istream_handle.stream.data_format['dtype'])
     self.ostream = data_stream(0)        
Example 11
 def __init__(self, istream_handle):
     self.input_format = istream_handle.stream.data_format
     self.input_size = self.input_format['shape'][0]
     super(naive_vad_score, self).__init__(istream_handle, 100, 0,
                                           self.input_format['shape'],
                                           self.input_format['dtype'])

     self.ostream = data_stream(istream_handle.stream.sample_rate,
                                data_format={'dtype': self.input_format['dtype']})
Example 12
 def __init__(self, file_name, output_rate=constants.AUDIO_SAMPLE_RATE):
     super(ffmpeg_decoder, self).__init__()
     self.file_name = file_name
     self.ostream = data_stream(output_rate, data_format={'dtype': np.int16})
     self.odtype = np.int16
     self.output_rate = output_rate
Example 13
def main(unused_argv):
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        level=logging.DEBUG if FLAGS.debug else logging.INFO)

    (train_xs, train_ys), (test_xs, test_ys), (dev_xs, dev_ys) \
        = load_bibtex('bibtex-train.arff', 'bibtex-test.arff')

    init_params = init_param(npr.PRNGKey(FLAGS.random_seed),
                             input_units=INPUTS,
                             label_size=LABELS,
                             feature_size=FLAGS.feature_size,
                             label_units=FLAGS.label_units,
                             hidden_units=FLAGS.hidden_units)

    @jit
    def update_pretrain(i, opt_state, batch):
        params = get_params(opt_state)
        loss, g = value_and_grad(pretrain_loss)(params, *batch)
        return opt_update(i, g, opt_state), loss

    def update_ssvm(i, opt_state, batch, pretrain_global_energy=False):
        params = get_params(opt_state)
        loss, g = value_and_grad(ssvm_loss)(
            params, *batch, pretrain_global_energy=pretrain_global_energy)
        return opt_update(i, g, opt_state), loss

    stages = [
        Config(batch_size=FLAGS.pretrain_batch_size,
               epochs=FLAGS.pretrain_epoch,
               update_fun=update_pretrain,
               inference_fun=inference_pretrained,
               optimizer=momentum(0.001, 0.95),
               msg='pretraining feature network'),
        Config(batch_size=FLAGS.ssvm_batch_size,
               epochs=FLAGS.energy_pretrain_epoch,
               update_fun=partial(update_ssvm, pretrain_global_energy=True),
               inference_fun=inference,
               optimizer=momentum(0.001, 0.95),
               msg='pretraining energy network'),
        Config(batch_size=FLAGS.ssvm_batch_size,
               epochs=FLAGS.e2e_train_epoch,
               update_fun=update_ssvm,
               inference_fun=inference,
               optimizer=momentum(0.001, 0.95),
               msg='finetune the entire network end-to-end')
    ]
    best_f1 = 0.
    params = init_params
    for stage in stages:
        opt_init, opt_update, get_params = stage.optimizer
        opt_state = opt_init(params)
        logger.info(stage.msg)
        num_batches = int(onp.ceil(len(train_xs) / stage.batch_size))
        train_stream = data_stream(train_xs,
                                   train_ys,
                                   batch_size=stage.batch_size,
                                   random_seed=FLAGS.random_seed,
                                   infty=True)
        itercount = itertools.count()
        for epoch in range(stage.epochs):
            step_loss = 0.
            for _ in tqdm(range(num_batches)):
                opt_state, loss = stage.update_fun(next(itercount), opt_state,
                                                   next(train_stream))
                step_loss += loss
            logger.info(f'epoch: {epoch} loss = {step_loss / num_batches}')
            f1 = evaluate(get_params(opt_state),
                          stage.inference_fun,
                          test_xs,
                          test_ys,
                          batch_size=stage.batch_size)
            if f1 > best_f1:
                best_f1 = f1
        params = get_params(opt_state)
    logger.info(f'best F1 score = {best_f1}')
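
For reference, a container with the fields the stages list relies on; the original code may define Config differently (e.g. as a dataclass), so this is only an assumption:

from collections import namedtuple

Config = namedtuple(
    'Config',
    ['batch_size', 'epochs', 'update_fun', 'inference_fun', 'optimizer', 'msg'])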