def __init__(self, istream_handle, squared=True, window_size=256):
    """Spectrum stage: windowed frames taken at half-window overlap.

    Args:
        istream_handle: upstream stream handle; provides the sample rate
            and the input dtype.
        squared: if True, the output stream carries real power values
            (float64); otherwise it carries complex spectra (complex128).
        window_size: analysis window length in samples; the hop distance
            is half the window.
    """
    self.window_size = window_size
    # Floor division keeps the hop an integer on Python 3; a plain '/'
    # would make shift_dist a float and corrupt the buffer size and the
    # output shape tuple below.
    self.shift_dist = window_size // 2
    self.batch_size = 100
    self.squared = squared
    # Enough room for batch_size overlapping windows.
    buff_size = self.window_size + (self.batch_size - 1) * self.shift_dist
    super(spectrum, self).__init__(istream_handle, buff_size,
                                   self.window_size - self.shift_dist, (),
                                   istream_handle.stream.data_format['dtype'])
    # Only the dtype differs between the two modes, so build the output
    # stream once instead of duplicating the call in each branch.
    out_dtype = np.double if squared else np.complex128
    self.ostream = data_stream(
        istream_handle.stream.sample_rate / float(self.shift_dist),
        data_format={'shape': (self.window_size // 2,), 'dtype': out_dtype})
    self.w = np.hamming(self.window_size)
def __init__(self, file_name):
    """Open a media file with pymedia and prepare its first audio track.

    Args:
        file_name: path of the media file; the demuxer is chosen from
            the file extension.

    Raises:
        IOError: if the file cannot be opened (raised by open itself).
        ValueError: if the container holds no audio stream.
    """
    super(audio_decoder, self).__init__()
    self.file_name = file_name
    ext = file_name.strip().split('.')[-1].lower()
    dm = muxer.Demuxer(ext)
    # open() raises on a missing file, so the old `if not fin` guard was
    # dead code; `raise "string"` was also a TypeError at runtime.
    fin = open(file_name, 'rb')
    s = fin.read(3000000)
    r = dm.parse(s)
    print(dm.streams)
    self.decoder = None
    # Pick the first audio stream in the container.
    for aindex, stream_info in enumerate(dm.streams):
        if stream_info and stream_info['type'] == muxer.CODEC_TYPE_AUDIO:
            self.decoder = acodec.Decoder(stream_info)
            self.aindex = aindex
            break
    if not self.decoder:
        # Raise a real exception object; raising a bare string is invalid
        # on Python >= 2.6.
        raise ValueError('no audio track found in given media file!')
    # Resample whatever the track provides down to the pipeline's mono
    # reference rate.
    self.resampler = sound.Resampler(
        (dm.streams[self.aindex]['sample_rate'],
         dm.streams[self.aindex]['channels']),
        (constants.AUDIO_SAMPLE_RATE, 1))
    self.ostream = data_stream(constants.AUDIO_SAMPLE_RATE,
                               data_format={'dtype': np.int16})
    self.odtype = np.int16
    self.demuxer = dm
    self.frames = r
    self.fin = fin
def __init__(self, file_name, output_rate=constants.AUDIO_SAMPLE_RATE):
    """Decoder stage backed by ffmpeg.

    Args:
        file_name: path of the media file to decode.
        output_rate: sample rate of the emitted int16 PCM stream.
    """
    super(ffmpeg_decoder, self).__init__()
    self.file_name = file_name
    self.output_rate = output_rate
    self.odtype = np.int16
    # Output carries 16-bit integer samples at the requested rate.
    self.ostream = data_stream(output_rate, data_format={'dtype': self.odtype})
def __init__(self, istream_handle):
    """Spectrum stage with a fixed 256-sample window and 128-sample hop.

    Args:
        istream_handle: upstream stream handle; provides the input dtype.
    """
    self.window_size = 256
    self.shift_dist = 128
    self.batch_size = 100
    # Enough room for batch_size overlapping windows.
    buff_size = self.window_size + (self.batch_size - 1) * self.shift_dist
    super(spectrum, self).__init__(istream_handle, buff_size,
                                   self.window_size - self.shift_dist, (),
                                   istream_handle.stream.data_format['dtype'])
    # window_size // 2 keeps the shape entry an int on Python 3 (plain
    # '/' would put a float into the shape tuple).
    self.ostream = data_stream(
        constants.AUDIO_SAMPLE_RATE / float(self.shift_dist),
        data_format={'shape': (self.window_size // 2,), 'dtype': np.double})
    # np.hamming is numerically identical to the old scipy.hamming alias,
    # which no longer exists in the top-level scipy namespace.
    self.w = np.hamming(self.window_size)
def __init__(self, istream_handle):
    # Scoring stage of the naive VAD. Appears to buffer up to 100 input
    # frames (no carry-over) and emit frames with the same shape/dtype
    # as the input stream.
    self.input_format = istream_handle.stream.data_format  # upstream frame format
    self.input_size = self.input_format['shape'][0]        # leading dim of each frame
    # Buffer of 100 frames, reserve 0; output keeps the input layout.
    super(naive_vad_score, self).__init__(istream_handle, 100, 0,
                                          self.input_format['shape'],
                                          self.input_format['dtype'])
    # Output stream runs at the input stream's rate with identical dtype.
    self.ostream = data_stream(
        istream_handle.stream.sample_rate,
        data_format={'dtype': self.input_format['dtype']})
def main(unused_argv):
    """Train an MLP with cross-entropy on the Bibtex multi-label task and
    report the best test-set F1 across epochs."""
    log_level = logging.DEBUG if FLAGS.debug else logging.INFO
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        level=log_level)

    (train_xs, train_ys), (test_xs, test_ys), (dev_xs, dev_ys) = load_bibtex(
        'bibtex-train.arff', 'bibtex-test.arff')

    _, init_params = init_mlp(npr.PRNGKey(FLAGS.random_seed),
                              (FLAGS.batch_size, INPUTS))
    opt_init, opt_update, get_params = adam(0.001)
    opt_state = opt_init(init_params)

    @jit
    def step(i, state, batch):
        # One optimizer step: loss and gradient come from a single
        # value_and_grad evaluation.
        loss, grads = value_and_grad(cross_entropy_loss)(get_params(state),
                                                         *batch)
        return opt_update(i, grads, state), loss

    num_batches = int(onp.ceil(len(train_xs) / FLAGS.batch_size))
    train_stream = data_stream(train_xs, train_ys,
                               batch_size=FLAGS.batch_size,
                               random_seed=FLAGS.random_seed,
                               infty=True)
    step_counter = itertools.count()

    best_f1 = 0.
    for epoch in range(FLAGS.epochs):
        running_loss = 0.
        for _ in tqdm(range(num_batches)):
            opt_state, batch_loss = step(next(step_counter), opt_state,
                                         next(train_stream))
            running_loss += batch_loss
        logger.info(f'epoch: {epoch} loss = {running_loss / num_batches}')
        f1 = evaluate(get_params(opt_state), inference, test_xs, test_ys,
                      batch_size=FLAGS.batch_size, threshold=0.5)
        if f1 > best_f1:
            best_f1 = f1
        logger.info(f'best F1 score = {best_f1}')
def __init__(self, istream_handle, srt_path='demo2.srt'):
    """VAD decision stage: consumes 129-dim score frames, emits speech
    decisions, and records detected segments to an SRT subtitle file.

    Args:
        istream_handle: upstream handle providing sample rate and dtype.
        srt_path: path of the subtitle file to write. Previously this was
            hard-coded to 'demo2.srt'; the default preserves old behavior
            for existing callers.
    """
    super(naive_vad_decision, self).__init__(
        istream_handle, 500, 0, (129,),
        istream_handle.stream.data_format['dtype'])
    self.ostream = data_stream(float('inf'))
    self.Ts = None                           # decision threshold, set lazily
    self.init_len = min(10, self.buff_size)  # frames used to bootstrap stats
    self.mu = 0                              # running mean
    self.sigma = 0                           # running deviation
    self.H = None
    self.alpha = 0.5                         # smoothing factor
    self.beta = 0.99                         # smoothing factor
    self.speech = False
    # Maximum in-speech gap (0.2 s) and minimum segment length (0.08 s),
    # expressed in units of the input stream's sample rate.
    self.max_sep = int(0.2 * istream_handle.stream.sample_rate)
    self.sep = 0
    self.min_seg = int(0.08 * istream_handle.stream.sample_rate)
    self.start_point = None
    # NOTE(review): this handle stays open for the stage's lifetime and
    # is not closed here — confirm it is closed when the stage finishes.
    self.fout = open(srt_path, 'w')
    self.count = 0
def __init__(self, istream_handle):
    """Configure the naive VAD segmenter.

    Args:
        istream_handle: upstream handle; must expose
            ``.stream.sample_rate`` and ``.stream.data_format``.
    """
    self.Fs = istream_handle.stream.sample_rate
    # Segmentation parameters; the *_len / *_pause values look like
    # millisecond durations (TODO confirm against the processing code).
    self.segment_len = 2000
    self.segment_reserve = 40
    self.low_freq = 20
    self.high_freq = 2
    self.frame_len = 10
    self.frame_shift = 10
    self.min_segment_len = 75
    self.breath_len = 20
    self.max_word_pause = 400
    self.max_clust_try = 100
    self.min_syllable_len = 100
    self.b = 3.5
    self.lambda_P = 1.0
    self.kmeans_error_threshold = 0.3
    self.preset_SNR = 10
    # Floor division keeps these counts integers on Python 3; a plain
    # '/' would yield floats and break np.zeros / the buffer size below.
    self.iframe_per_segment = (self.segment_len - self.frame_len) // self.frame_shift + 1
    self.isample_per_frame = self.frame_len * self.Fs // 1000
    self.MCR_low = 0.03
    self.MCR_high = 0.4
    self.bcheck_prosody = False
    self.last_fragment = []
    self.bcircle_done = False
    self.istate = 1
    # Five slots initialized to INT32_MIN as a sentinel minimum.
    self.A = [-2147483648] * 5
    self.peak_E = -1.0
    # np.float was removed in NumPy 1.24; the builtin float is the same
    # dtype (float64).
    self.segment_E = np.zeros(self.iframe_per_segment, dtype=float)
    self.bin_speech = False
    self.icur_frame = 0
    self.start_end = []
    super(naive_vad, self).__init__(
        istream_handle,
        self.isample_per_frame * self.iframe_per_segment,
        0, (), istream_handle.stream.data_format['dtype'])
    self.ostream = data_stream(0)
def __init__(self, istream_handle):
    # VAD decision stage: consumes 129-dim score frames, emits speech
    # decisions, and writes detected segments to an SRT subtitle file.
    super(naive_vad_decision, self).__init__(
        istream_handle, 500, 0, (129, ),
        istream_handle.stream.data_format['dtype'])
    self.ostream = data_stream(float('inf'))
    self.Ts = None                           # decision threshold, set lazily
    self.init_len = min(10, self.buff_size)  # frames used to bootstrap stats
    self.mu = 0                              # running mean
    self.sigma = 0                           # running deviation
    self.H = None
    self.alpha = 0.5                         # smoothing factor
    self.beta = 0.99                         # smoothing factor
    self.speech = False
    # Maximum in-speech gap (0.2 s) and minimum segment length (0.08 s),
    # in units of the input stream's sample rate.
    self.max_sep = int(0.2 * istream_handle.stream.sample_rate)
    self.sep = 0
    self.min_seg = int(0.08 * istream_handle.stream.sample_rate)
    self.start_point = None
    # NOTE(review): hard-coded output path; the handle is never closed
    # here — presumably closed when the stage finishes. Confirm.
    self.fout = open('demo2.srt', 'w')
    self.count = 0
def __init__(self, istream_handle):
    """Set up the naive VAD segmenter's parameters and buffers.

    Args:
        istream_handle: upstream handle exposing ``.stream.sample_rate``
            and ``.stream.data_format``.
    """
    self.Fs = istream_handle.stream.sample_rate
    # Tunable segmentation constants; the *_len / *_pause values look
    # like millisecond durations (TODO confirm downstream).
    self.segment_len = 2000
    self.segment_reserve = 40
    self.low_freq = 20
    self.high_freq = 2
    self.frame_len = 10
    self.frame_shift = 10
    self.min_segment_len = 75
    self.breath_len = 20
    self.max_word_pause = 400
    self.max_clust_try = 100
    self.min_syllable_len = 100
    self.b = 3.5
    self.lambda_P = 1.0
    self.kmeans_error_threshold = 0.3
    self.preset_SNR = 10
    # Integer (floor) division: with '/' these become floats on Python 3
    # and np.zeros / the super() buffer size below would fail.
    self.iframe_per_segment = (self.segment_len - self.frame_len) // self.frame_shift + 1
    self.isample_per_frame = self.frame_len * self.Fs // 1000
    self.MCR_low = 0.03
    self.MCR_high = 0.4
    self.bcheck_prosody = False
    self.last_fragment = []
    self.bcircle_done = False
    self.istate = 1
    # INT32_MIN sentinel in each of the 5 accumulator slots
    # (replaces the old Python-2-only xrange comprehension).
    self.A = [-2147483648] * 5
    self.peak_E = -1.0
    # dtype=float == float64; np.float no longer exists in NumPy >= 1.24.
    self.segment_E = np.zeros(self.iframe_per_segment, dtype=float)
    self.bin_speech = False
    self.icur_frame = 0
    self.start_end = []
    super(naive_vad, self).__init__(
        istream_handle,
        self.isample_per_frame * self.iframe_per_segment,
        0, (), istream_handle.stream.data_format['dtype'])
    self.ostream = data_stream(0)
def __init__(self, istream_handle):
    """Scoring stage of the naive VAD: buffers input frames and emits
    them with the upstream shape, dtype, and sample rate unchanged."""
    fmt = istream_handle.stream.data_format
    self.input_format = fmt
    self.input_size = fmt['shape'][0]
    # 100-frame buffer, no reserve; layout mirrors the input stream.
    super(naive_vad_score, self).__init__(istream_handle, 100, 0,
                                          fmt['shape'], fmt['dtype'])
    self.ostream = data_stream(istream_handle.stream.sample_rate,
                               data_format={'dtype': fmt['dtype']})
def __init__(self, file_name, output_rate = constants.AUDIO_SAMPLE_RATE):
    # Decoder stage backed by ffmpeg.
    #   file_name: path of the media file to decode.
    #   output_rate: sample rate of the emitted int16 PCM stream.
    super(ffmpeg_decoder, self).__init__()
    self.file_name = file_name
    # Output carries 16-bit integer samples at the requested rate.
    self.ostream = data_stream(output_rate, data_format = {'dtype':np.int16})
    self.odtype = np.int16
    self.output_rate = output_rate
def main(unused_argv):
    """Three-stage SPEN training on Bibtex: pretrain the feature network,
    pretrain the energy network, then finetune end-to-end; reports the
    best test-set F1 seen across all stages."""
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        level=logging.DEBUG if FLAGS.debug else logging.INFO)
    # NOTE(review): the dev split is loaded but never used below —
    # confirm whether model selection should use dev instead of test.
    (train_xs, train_ys), (test_xs, test_ys), (dev_xs, dev_ys) \
        = load_bibtex('bibtex-train.arff', 'bibtex-test.arff')
    init_params = init_param(npr.PRNGKey(FLAGS.random_seed),
                             input_units=INPUTS,
                             label_size=LABELS,
                             feature_size=FLAGS.feature_size,
                             label_units=FLAGS.label_units,
                             hidden_units=FLAGS.hidden_units)

    @jit
    def update_pretrain(i, opt_state, batch):
        # One optimizer step on the feature-network pretraining loss.
        # get_params/opt_update are resolved from the enclosing scope;
        # they are bound inside the stage loop before the first call.
        params = get_params(opt_state)
        loss, g = value_and_grad(pretrain_loss)(params, *batch)
        return opt_update(i, g, opt_state), loss

    def update_ssvm(i, opt_state, batch, pretrain_global_energy=False):
        # One optimizer step on the SSVM loss; the flag switches it into
        # the energy-pretraining regime.
        # NOTE(review): not @jit-wrapped — presumably so the per-stage
        # rebinding of opt_update/get_params takes effect on every call
        # (this function is used by two different stages). Confirm.
        params = get_params(opt_state)
        loss, g = value_and_grad(ssvm_loss)(
            params, *batch, pretrain_global_energy=pretrain_global_energy)
        return opt_update(i, g, opt_state), loss

    # Each Config describes one curriculum stage: batch size, epoch
    # budget, update/inference functions, optimizer, and a log message.
    stages = [
        Config(batch_size=FLAGS.pretrain_batch_size,
               epochs=FLAGS.pretrain_epoch,
               update_fun=update_pretrain,
               inference_fun=inference_pretrained,
               optimizer=momentum(0.001, 0.95),
               msg='pretraining feature network'),
        Config(batch_size=FLAGS.ssvm_batch_size,
               epochs=FLAGS.energy_pretrain_epoch,
               update_fun=partial(update_ssvm, pretrain_global_energy=True),
               inference_fun=inference,
               optimizer=momentum(0.001, 0.95),
               msg='pretraining energy network'),
        Config(batch_size=FLAGS.ssvm_batch_size,
               epochs=FLAGS.e2e_train_epoch,
               update_fun=update_ssvm,
               inference_fun=inference,
               optimizer=momentum(0.001, 0.95),
               msg='finetune the entire network end-to-end')
    ]
    best_f1 = 0.
    params = init_params
    for stage in stages:
        # Fresh optimizer state per stage, warm-started from the best
        # parameters found so far.
        opt_init, opt_update, get_params = stage.optimizer
        opt_state = opt_init(params)
        logger.info(stage.msg)
        num_batches = int(onp.ceil(len(train_xs) / stage.batch_size))
        train_stream = data_stream(train_xs, train_ys,
                                   batch_size=stage.batch_size,
                                   random_seed=FLAGS.random_seed,
                                   infty=True)
        itercount = itertools.count()
        for epoch in range(stage.epochs):
            step_loss = 0.
            for _ in tqdm(range(num_batches)):
                opt_state, loss = stage.update_fun(next(itercount), opt_state,
                                                   next(train_stream))
                step_loss += loss
            logger.info(f'epoch: {epoch} loss = {step_loss / num_batches}')
            f1 = evaluate(get_params(opt_state), stage.inference_fun,
                          test_xs, test_ys, batch_size=stage.batch_size)
            if f1 > best_f1:
                best_f1 = f1
                # Carry the best parameters forward into the next stage.
                params = get_params(opt_state)
            logger.info(f'best F1 score = {best_f1}')