def load_next_partition(self, shared_xy):
    """Load the next pickled (features, labels) partition into Theano
    shared variables.

    shared_xy -- tuple (shared_x, shared_y) of Theano shared variables
                 receiving the feature matrix and label vector.

    Side effects: updates self.feat_mat / self.label_vec /
    self.cur_frame_num, advances self.cur_pfile_index, and sets
    self.end_reading once the last pfile has been consumed.
    """
    pfile_path = self.pfile_path_list[self.cur_pfile_index]
    # Re-read from disk only when nothing is cached yet, or when multiple
    # pfiles are being cycled (a single pfile stays cached across calls).
    if self.feat_mat is None or len(self.pfile_path_list) > 1:
        fopen = smart_open(pfile_path, 'rb')
        self.feat_mat, self.label_vec = cPickle.load(fopen)
        fopen.close()
    shared_x, shared_y = shared_xy
    # NOTE(review): an upstream TODO questioned whether label_vec is still
    # a 1-D vector after preprocessing — confirm against the helper.
    self.feat_mat, self.label_vec = \
        preprocess_feature_and_label(self.feat_mat, self.label_vec, self.read_opts)
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feat_mat, self.label_vec)
    shared_x.set_value(self.feat_mat, borrow=True)
    # Labels go to the shared variable as floatX (GPU-compatible dtype);
    # consumers cast back to int as needed.
    shared_y.set_value(self.label_vec.astype(theano.config.floatX), borrow=True)
    self.cur_frame_num = len(self.feat_mat)
    self.cur_pfile_index += 1
    if self.cur_pfile_index >= len(self.pfile_path_list):  # the end of one epoch
        self.end_reading = True
        self.cur_pfile_index = 0
def load_next_partition(self, shared_xy):
    """Load the next pickled partition into Theano shared variables.

    The pickle may contain either (feat_mat, label_vec) or
    (feat_mat, label_vec, extra); a third element is ignored.

    Side effects: updates self.feat_mat / self.label_vec /
    self.cur_frame_num, advances self.cur_pfile_index, and sets
    self.end_reading at the end of an epoch.

    Raises ValueError if the pickle holds an unexpected number of
    elements (previously this was silently ignored, leaving stale data
    in feat_mat/label_vec).
    """
    pfile_path = self.pfile_path_list[self.cur_pfile_index]
    if self.feat_mat is None or len(self.pfile_path_list) > 1:
        fopen = smart_open(pfile_path, 'rb')
        loaded = cPickle.load(fopen)
        fopen.close()
        if len(loaded) == 2:
            self.feat_mat, self.label_vec = loaded
        elif len(loaded) == 3:
            self.feat_mat, self.label_vec, _unused = loaded
        else:
            raise ValueError('unexpected pickle layout in %s: %d elements'
                             % (pfile_path, len(loaded)))
    shared_x, shared_y = shared_xy
    self.feat_mat, self.label_vec = \
        preprocess_feature_and_label(self.feat_mat, self.label_vec, self.read_opts)
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feat_mat, self.label_vec)
    shared_x.set_value(self.feat_mat, borrow=True)
    # Cast labels to floatX before handing them to the shared variable,
    # consistent with the other partition loaders in this file.
    shared_y.set_value(self.label_vec.astype(theano.config.floatX), borrow=True)
    self.cur_frame_num = len(self.feat_mat)
    self.cur_pfile_index += 1
    if self.cur_pfile_index >= len(self.pfile_path_list):
        # the end of one epoch
        self.end_reading = True
        self.cur_pfile_index = 0
def load_next_partition(self, shared_xy):
    """Load the next pickled partition into Theano shared variables.

    Accepts pickles holding either (feat_mat, label_vec) or
    (feat_mat, label_vec, extra); any third element is discarded.

    Side effects: updates self.feat_mat / self.label_vec /
    self.cur_frame_num, advances self.cur_pfile_index, and sets
    self.end_reading at the end of an epoch.
    """
    # Leftover debug print() calls (including one with a typo, "is;")
    # were removed from this loader.
    pfile_path = self.pfile_path_list[self.cur_pfile_index]
    if self.feat_mat is None or len(self.pfile_path_list) > 1:
        fopen = smart_open(pfile_path, "rb")
        test = cPickle.load(fopen)
        if len(test) == 2:
            self.feat_mat, self.label_vec = test
        elif len(test) == 3:
            self.feat_mat, self.label_vec, unused = test
        fopen.close()
    shared_x, shared_y = shared_xy
    self.feat_mat, self.label_vec = preprocess_feature_and_label(
        self.feat_mat, self.label_vec, self.read_opts)
    if self.read_opts["random"]:
        shuffle_feature_and_label(self.feat_mat, self.label_vec)
    shared_x.set_value(self.feat_mat, borrow=True)
    # Labels are cast to floatX for the GPU shared variable.
    shared_y.set_value(self.label_vec.astype(theano.config.floatX), borrow=True)
    self.cur_frame_num = len(self.feat_mat)
    self.cur_pfile_index += 1
    if self.cur_pfile_index >= len(self.pfile_path_list):  # the end of one epoch
        self.end_reading = True
        self.cur_pfile_index = 0
def load_next_partition(self, shared_xy):
    """Fill one partition's worth of frames into the shared variables.

    Reads utterances sequentially via self.read_next_utt(), carrying any
    partially consumed utterance over in self.feat_buffer /
    self.label_buffer between calls. Sets self.end_reading when the
    input is exhausted and self.cur_frame_num to the frames loaded.
    """
    shared_x, shared_y = shared_xy
    if self.feat_buffer is None:
        read_frame_num = 0
    else:
        # An utterance hasn't been completely consumed yet
        read_frame_num = min(self.max_frame_num, len(self.feat_buffer))
        self.feats[0:read_frame_num] = self.feat_buffer[0:read_frame_num]
        if self.ali_provided:
            self.labels[0:read_frame_num] = self.label_buffer[0:read_frame_num]
        if read_frame_num == len(self.feat_buffer):
            self.feat_buffer = None
            self.label_buffer = None
        else:
            self.feat_buffer = self.feat_buffer[read_frame_num:]
            if self.ali_provided:
                self.label_buffer = self.label_buffer[read_frame_num:]
    while read_frame_num < self.max_frame_num:
        utt_id, utt_mat = self.read_next_utt()
        if utt_id == '':
            # No more utterances available
            self.end_reading = True
            break
        # Skip utterances without an alignment. ("not in" replaces the
        # Python-2-only "alignment.has_key(utt_id) is False" with
        # identical semantics.)
        if self.ali_provided and utt_id not in self.alignment:
            continue
        rows = len(utt_mat)
        if self.ali_provided:
            ali_utt = self.alignment[utt_id]
            if len(ali_utt) != rows:
                # Alignment length mismatch: drop the utterance.
                continue
        else:
            ali_utt = None
        utt_mat, ali_utt = preprocess_feature_and_label(utt_mat, ali_utt, self.read_opts)
        rows = len(utt_mat)
        if read_frame_num + rows > self.max_frame_num:
            # Utterance won't fit in the current partition: use some
            # frames and keep the rest for the next partition.
            rows = self.max_frame_num - read_frame_num
            self.feat_buffer = utt_mat[rows:]
            utt_mat = utt_mat[:rows]
            if self.ali_provided:
                self.label_buffer = ali_utt[rows:]
                ali_utt = ali_utt[:rows]
        self.feats[read_frame_num:(read_frame_num + rows)] = utt_mat
        if self.ali_provided:
            self.labels[read_frame_num:(read_frame_num + rows)] = ali_utt
        read_frame_num += rows
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feats[0:read_frame_num],
                                  self.labels[0:read_frame_num])
    shared_x.set_value(self.feats[0:read_frame_num], borrow=True)
    if self.ali_provided:
        shared_y.set_value(self.labels[0:read_frame_num], borrow=True)
    self.cur_frame_num = read_frame_num
def load_next_partition(self, shared_xy):
    """Read one partition from the current pfile into shared variables.

    Accumulates whole sentences into feat_buffer/label_buffer until a
    partition (self.frame_per_partition frames) is filled, then loads it
    into the shared variables. Advances to the next pfile (or flags
    end_reading) when the current one is exhausted.
    """
    shared_x, shared_y = shared_xy
    # read one partition from disk; data format for pfile reading
    # d -- features; l -- label
    self.dtype = numpy.dtype({'names': ['d', 'l'],
                              'formats': [('>f', self.original_feat_dim), '>i'],
                              'offsets': [self.feat_start_column * 4,
                                          self.label_start_column * 4]})
    while len(self.feat_buffer) < self.frame_per_partition and self.sentence_index < self.num_sentences:
        num_frames = self.sentence_offset[self.sentence_index + 1] - self.sentence_offset[self.sentence_index]
        # BUG FIX: the original tested "self.file_read is file", which
        # compares the object against the built-in *type* and is always
        # False, so the fromfile fast path never ran. isinstance is the
        # intended "plain uncompressed file" check.
        if isinstance(self.file_read, file):  # not a compressed file
            sentence_array = numpy.fromfile(self.file_read, self.dtype, num_frames)
        else:
            nbytes = 4 * num_frames * (self.label_start_column + self.num_labels)
            d_tmp = self.file_read.read(nbytes)
            # NOTE(review): numpy.fromstring is deprecated in favor of
            # frombuffer; kept for behavioral parity with this Py2 codebase.
            sentence_array = numpy.fromstring(d_tmp, self.dtype, num_frames)
        feat_mat = numpy.asarray(sentence_array['d'])
        label_vec = numpy.asarray(sentence_array['l'])
        feat_mat, label_vec = preprocess_feature_and_label(feat_mat, label_vec, self.read_opts)
        self.feat_buffer = numpy.concatenate((self.feat_buffer, feat_mat))
        self.label_buffer = numpy.concatenate((self.label_buffer, label_vec))
        self.sentence_index += 1
    self.feat = self.feat_buffer[:self.frame_per_partition].astype(theano.config.floatX)
    self.label = self.label_buffer[:self.frame_per_partition].astype(theano.config.floatX)
    self.feat_buffer = self.feat_buffer[self.frame_per_partition:]
    self.label_buffer = self.label_buffer[self.frame_per_partition:]
    self.cur_frame_num = len(self.feat)
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feat, self.label)
    shared_x.set_value(self.feat, borrow=True)
    shared_y.set_value(self.label, borrow=True)
    if self.sentence_index >= self.num_sentences and len(self.feat_buffer) == 0:
        # move on to the next pfile
        self.cur_pfile_index += 1
        if self.cur_pfile_index >= len(self.pfile_path_list):
            self.end_reading = True
            self.cur_pfile_index = 0
        else:
            self.initialize_read()
def load_next_partition(self, shared_xy):
    """Load the next pickled (features, labels) partition into the given
    Theano shared variables and advance to the following pfile.

    Sets self.cur_frame_num and flags self.end_reading once every pfile
    in self.pfile_path_list has been served (one epoch).
    """
    shared_x, shared_y = shared_xy
    current_path = self.pfile_path_list[self.cur_pfile_index]
    # Hit the disk only when nothing is cached, or when several pfiles
    # are being cycled through.
    needs_reload = self.feat_mat is None or len(self.pfile_path_list) > 1
    if needs_reload:
        stream = smart_open(current_path, 'rb')
        self.feat_mat, self.label_vec = load(stream)
        stream.close()
    self.feat_mat, self.label_vec = preprocess_feature_and_label(
        self.feat_mat, self.label_vec, self.read_opts)
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feat_mat, self.label_vec)
    shared_x.set_value(self.feat_mat, borrow=True)
    # Labels are pushed to the shared variable as floatX.
    shared_y.set_value(self.label_vec.astype(theano.config.floatX), borrow=True)
    self.cur_frame_num = len(self.feat_mat)
    self.cur_pfile_index += 1
    if self.cur_pfile_index >= len(self.pfile_path_list):
        # One full epoch of pfiles consumed; wrap around to the first.
        self.end_reading = True
        self.cur_pfile_index = 0
def load_next_partition(self, shared_xy):
    """Fill up to self.max_frame_num frames from the scp stream into the
    shared variables.

    An utterance that would overflow the partition is rewound (seek back
    to self.scp_cur_pos) and re-read at the start of the next call.
    Sets self.end_reading when the stream is exhausted and
    self.cur_frame_num to the number of frames actually loaded.
    """
    shared_x, shared_y = shared_xy
    read_frame_num = 0
    while True:
        utt_id, utt_mat = self.read_next_utt()
        if utt_id == '':
            self.end_reading = True
            break
        # Skip utterances without an alignment. ("not in" replaces the
        # Python-2-only "alignment.has_key(utt_id) is False" with
        # identical semantics.)
        if self.ali_provided and utt_id not in self.alignment:
            continue
        rows = utt_mat.shape[0]
        if self.ali_provided:
            ali_utt = self.alignment[utt_id]
            if ali_utt.shape[0] != rows:
                # Alignment length mismatch: drop the utterance.
                continue
        else:
            ali_utt = None
        utt_mat, ali_utt = preprocess_feature_and_label(utt_mat, ali_utt, self.read_opts)
        rows = utt_mat.shape[0]
        if read_frame_num + rows > self.max_frame_num:
            # Rewind so this utterance is re-read in the next partition.
            self.scp_file_read.seek(self.scp_cur_pos)
            break
        self.feats[read_frame_num:(read_frame_num + rows)] = utt_mat
        if self.ali_provided:
            self.labels[read_frame_num:(read_frame_num + rows)] = ali_utt
        read_frame_num += rows
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feats[0:read_frame_num],
                                  self.labels[0:read_frame_num])
    shared_x.set_value(self.feats[0:read_frame_num], borrow=True)
    if self.ali_provided:
        shared_y.set_value(self.labels[0:read_frame_num], borrow=True)
    self.cur_frame_num = read_frame_num
def load_next_partition(self, shared_xy):
    """Load one bloscpack partition (feature array plus its companion
    ".labels" array) into the shared variables.

    Returns the path of the partition that was served. Flags
    self.end_reading and wraps self.cur_pfile_index at the end of an
    epoch.
    """
    partition_path = self.pfile_path_list[self.cur_pfile_index]
    # Only re-read when nothing is cached or several partitions rotate.
    needs_reload = self.feat_mat is None or len(self.pfile_path_list) > 1
    if needs_reload:
        self.feat_mat = bp.unpack_ndarray_file(partition_path)
        self.label_vec = bp.unpack_ndarray_file(partition_path + ".labels")
    shared_x, shared_y = shared_xy
    self.feat_mat, self.label_vec = preprocess_feature_and_label(
        self.feat_mat, self.label_vec, self.read_opts)
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feat_mat, self.label_vec)
    shared_x.set_value(self.feat_mat, borrow=True)
    # Labels are pushed to the shared variable as floatX.
    shared_y.set_value(self.label_vec.astype(theano.config.floatX), borrow=True)
    self.cur_frame_num = len(self.feat_mat)
    self.cur_pfile_index += 1
    if self.cur_pfile_index >= len(self.pfile_path_list):
        # the end of one epoch
        self.end_reading = True
        self.cur_pfile_index = 0
    return partition_path
def load_next_partition(self, shared_xy):
    """Fill one partition's worth of frames into the shared variables.

    Reads utterances sequentially via self.read_next_utt(), carrying any
    partially consumed utterance over in self.feat_buffer /
    self.label_buffer between calls. Sets self.end_reading when the
    input is exhausted and self.cur_frame_num to the frames loaded.
    """
    shared_x, shared_y = shared_xy
    if self.feat_buffer is None:
        read_frame_num = 0
    else:
        # An utterance hasn't been completely consumed yet
        read_frame_num = min(self.max_frame_num, len(self.feat_buffer))
        self.feats[0:read_frame_num] = self.feat_buffer[0:read_frame_num]
        if self.ali_provided:
            self.labels[0:read_frame_num] = self.label_buffer[0:read_frame_num]
        if read_frame_num == len(self.feat_buffer):
            self.feat_buffer = None
            self.label_buffer = None
        else:
            self.feat_buffer = self.feat_buffer[read_frame_num:]
            if self.ali_provided:
                self.label_buffer = self.label_buffer[read_frame_num:]
    while read_frame_num < self.max_frame_num:
        utt_id, utt_mat = self.read_next_utt()
        if utt_id == '':
            # No more utterances available
            self.end_reading = True
            break
        # Skip utterances without an alignment. ("not in" replaces the
        # Python-2-only "alignment.has_key(utt_id) is False" with
        # identical semantics.)
        if self.ali_provided and utt_id not in self.alignment:
            continue
        rows = len(utt_mat)
        if self.ali_provided:
            ali_utt = self.alignment[utt_id]
            if len(ali_utt) != rows:
                # Alignment length mismatch: drop the utterance.
                continue
        else:
            ali_utt = None
        utt_mat, ali_utt = preprocess_feature_and_label(
            utt_mat, ali_utt, self.read_opts)
        rows = len(utt_mat)
        if read_frame_num + rows > self.max_frame_num:
            # Utterance won't fit in the current partition: use some
            # frames and keep the rest for the next partition.
            rows = self.max_frame_num - read_frame_num
            self.feat_buffer = utt_mat[rows:]
            utt_mat = utt_mat[:rows]
            if self.ali_provided:
                self.label_buffer = ali_utt[rows:]
                ali_utt = ali_utt[:rows]
        self.feats[read_frame_num:(read_frame_num + rows)] = utt_mat
        if self.ali_provided:
            self.labels[read_frame_num:(read_frame_num + rows)] = ali_utt
        read_frame_num += rows
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feats[0:read_frame_num],
                                  self.labels[0:read_frame_num])
    shared_x.set_value(self.feats[0:read_frame_num], borrow=True)
    if self.ali_provided:
        shared_y.set_value(self.labels[0:read_frame_num], borrow=True)
    self.cur_frame_num = read_frame_num
def load_next_partition(self, shared_xy):
    """Read one partition from the current pfile into shared variables,
    carrying partially consumed sentences over in feat_buffer /
    label_buffer between calls.

    Advances to the next pfile (or flags end_reading) when the current
    one is exhausted and nothing remains buffered.
    """
    shared_x, shared_y = shared_xy
    # read one partition from disk; data format for pfile reading
    # d -- features; l -- label
    self.dtype = numpy.dtype({'names': ['d', 'l'],
                              'formats': [('>f', self.original_feat_dim), '>i'],
                              'offsets': [self.feat_start_column * 4,
                                          self.label_start_column * 4]})
    if self.feat_buffer is None:
        read_frames = 0
    else:
        # An utterance hasn't been completely consumed yet
        read_frames = min(self.frame_per_partition, len(self.feat_buffer))
        self.feat[0:read_frames] = self.feat_buffer[0:read_frames]
        self.label[0:read_frames] = self.label_buffer[0:read_frames]
        if read_frames == len(self.feat_buffer):
            self.feat_buffer = None
            self.label_buffer = None
        else:
            self.feat_buffer = self.feat_buffer[read_frames:]
            self.label_buffer = self.label_buffer[read_frames:]
    while read_frames < self.frame_per_partition and self.sentence_index < self.num_sentences:
        num_frames = self.sentence_offset[self.sentence_index + 1] - self.sentence_offset[self.sentence_index]
        # BUG FIX: the original tested "self.file_read is file", which
        # compares the object against the built-in *type* and is always
        # False, so uncompressed pfiles always took the generic
        # read()+fromstring path. isinstance is the intended check.
        if isinstance(self.file_read, file):  # not a compressed file
            sentence_array = numpy.fromfile(self.file_read, self.dtype, num_frames)
        else:
            nbytes = 4 * num_frames * (self.label_start_column + self.num_labels)
            d_tmp = self.file_read.read(nbytes)
            sentence_array = numpy.fromstring(d_tmp, self.dtype, num_frames)
        feat_mat = numpy.asarray(sentence_array['d'])
        label_vec = numpy.asarray(sentence_array['l'])
        feat_mat, label_vec = preprocess_feature_and_label(feat_mat, label_vec, self.read_opts)
        num_frames = len(feat_mat)
        if read_frames + num_frames > self.frame_per_partition:
            # Sentence won't fit in the current partition: use some
            # frames and keep the rest for the next partition.
            num_frames = self.frame_per_partition - read_frames
            self.feat_buffer = feat_mat[num_frames:]
            self.label_buffer = label_vec[num_frames:]
            feat_mat = feat_mat[:num_frames]
            label_vec = label_vec[:num_frames]
        self.feat[read_frames:(read_frames + num_frames)] = feat_mat
        self.label[read_frames:(read_frames + num_frames)] = label_vec
        read_frames += num_frames
        self.sentence_index += 1
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feat[:read_frames], self.label[:read_frames])
    shared_x.set_value(self.feat, borrow=True)
    shared_y.set_value(self.label, borrow=True)
    self.cur_frame_num = read_frames
    if self.sentence_index >= self.num_sentences and self.feat_buffer is None:
        # move on to the next pfile
        self.cur_pfile_index += 1
        if self.cur_pfile_index >= len(self.pfile_path_list):
            self.end_reading = True
            self.cur_pfile_index = 0
        else:
            self.initialize_read()
def load_next_partition(self, shared_xy):
    """Read one partition from the current pfile into shared variables,
    carrying partially consumed sentences over in feat_buffer /
    label_buffer between calls.

    Advances to the next pfile (or flags end_reading) when the current
    one is exhausted and nothing remains buffered.
    """
    shared_x, shared_y = shared_xy
    # read one partition from disk; data format for pfile reading
    # d -- features; l -- label
    self.dtype = numpy.dtype({
        'names': ['d', 'l'],
        'formats': [('>f', self.original_feat_dim), '>i'],
        'offsets': [self.feat_start_column * 4, self.label_start_column * 4]
    })
    if self.feat_buffer is None:
        read_frames = 0
    else:
        # An utterance hasn't been completely consumed yet
        read_frames = min(self.frame_per_partition, len(self.feat_buffer))
        self.feat[0:read_frames] = self.feat_buffer[0:read_frames]
        self.label[0:read_frames] = self.label_buffer[0:read_frames]
        if read_frames == len(self.feat_buffer):
            self.feat_buffer = None
            self.label_buffer = None
        else:
            self.feat_buffer = self.feat_buffer[read_frames:]
            self.label_buffer = self.label_buffer[read_frames:]
    while read_frames < self.frame_per_partition and self.sentence_index < self.num_sentences:
        num_frames = self.sentence_offset[self.sentence_index + 1] - self.sentence_offset[
            self.sentence_index]
        # BUG FIX: the original tested "self.file_read is file", which
        # compares the object against the built-in *type* and is always
        # False, so uncompressed pfiles always took the generic
        # read()+fromstring path. isinstance is the intended check.
        if isinstance(self.file_read, file):  # not a compressed file
            sentence_array = numpy.fromfile(self.file_read, self.dtype, num_frames)
        else:
            nbytes = 4 * num_frames * (self.label_start_column + self.num_labels)
            d_tmp = self.file_read.read(nbytes)
            sentence_array = numpy.fromstring(d_tmp, self.dtype, num_frames)
        feat_mat = numpy.asarray(sentence_array['d'])
        label_vec = numpy.asarray(sentence_array['l'])
        feat_mat, label_vec = preprocess_feature_and_label(
            feat_mat, label_vec, self.read_opts)
        num_frames = len(feat_mat)
        if read_frames + num_frames > self.frame_per_partition:
            # Sentence won't fit in the current partition: use some
            # frames and keep the rest for the next partition.
            num_frames = self.frame_per_partition - read_frames
            self.feat_buffer = feat_mat[num_frames:]
            self.label_buffer = label_vec[num_frames:]
            feat_mat = feat_mat[:num_frames]
            label_vec = label_vec[:num_frames]
        self.feat[read_frames:(read_frames + num_frames)] = feat_mat
        self.label[read_frames:(read_frames + num_frames)] = label_vec
        read_frames += num_frames
        self.sentence_index += 1
    if self.read_opts['random']:
        shuffle_feature_and_label(self.feat[:read_frames], self.label[:read_frames])
    shared_x.set_value(self.feat, borrow=True)
    shared_y.set_value(self.label, borrow=True)
    self.cur_frame_num = read_frames
    if self.sentence_index >= self.num_sentences and self.feat_buffer is None:
        # move on to the next pfile
        self.cur_pfile_index += 1
        if self.cur_pfile_index >= len(self.pfile_path_list):
            self.end_reading = True
            self.cur_pfile_index = 0
        else:
            self.initialize_read()