def reset(self):
    """Prepare the source and target streams for another pass.

    Without shuffling, both already-open streams are rewound to offset 0.
    With shuffling enabled, ``shuffle.main`` is invoked on
    ``self.source_orig`` / ``self.target_orig`` (``temporary=True``) and
    ``self.source`` / ``self.target`` are rebound to the two objects it
    returns.
    """
    if not self.shuffle:
        # Same streams are reused; just move both back to the beginning.
        self.source.seek(0)
        self.target.seek(0)
        return

    # Fresh shuffle of the original inputs; result unpacks to exactly two
    # items, in (source, target) order.
    reshuffled = shuffle.main(
        [self.source_orig, self.target_orig],
        temporary=True,
    )
    self.source, self.target = reshuffled
def __init__(self, source, source_dict,
             batch_size=128,
             n_words_source=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=False,
             maxibatch_size=20,
             ):
    """Set up the iterator over a single source input.

    Args:
        source: Input handed to ``data_utils.fopen(source, 'r')`` when not
            shuffling, or to ``shuffle.main`` (and kept as
            ``self.source_orig``) when ``shuffle_each_epoch`` is true.
        source_dict: Argument passed to ``load_dict``; the result is
            iterated as a key -> index mapping.
        batch_size: Stored as ``self.batch_size``.
        n_words_source: If > 0, every dictionary entry whose index is
            ``>= n_words_source`` is removed from ``self.source_dict``.
        skip_empty: Stored as ``self.skip_empty``.
        shuffle_each_epoch: Selects the shuffled input path and is stored
            as ``self.shuffle``.
        sort_by_length: Stored as ``self.sort_by_length``.
        maxibatch_size: Multiplied by ``batch_size`` to give ``self.k``.
    """
    if shuffle_each_epoch:
        self.source_orig = source
        # NOTE(review): shuffle.main is given a one-element list and its
        # return value is stored without unpacking; if it returns a list
        # of handles, self.source is that list, not the handle -- confirm.
        self.source = shuffle.main([self.source_orig], temporary=True)
    else:
        self.source = data_utils.fopen(source, 'r')

    self.source_dict = load_dict(source_dict)

    self.batch_size = batch_size
    self.skip_empty = skip_empty
    self.n_words_source = n_words_source

    if self.n_words_source > 0:
        # Collect the keys first: deleting while iterating over .items()
        # raises "dictionary changed size during iteration" on Python 3.
        out_of_range = [
            key for key, idx in self.source_dict.items()
            if idx >= self.n_words_source
        ]
        for key in out_of_range:
            del self.source_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.label_buffer = []
    # presumably the number of items buffered at once -- confirm against
    # the consumer of self.k.
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
def __init__(self, source, target,
             source_dict, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    """Set up the iterator over a source input and a target input.

    Args:
        source: Source path; opened as UTF-8 text when not shuffling,
            otherwise kept as ``self.source_orig`` and given to
            ``shuffle.main``.
        target: Target path; handled the same way as ``source``.
        source_dict: Argument passed to ``load_dict`` for the source side;
            the result is iterated as a key -> index mapping.
        target_dict: Same, for the target side.
        batch_size: Stored as ``self.batch_size``.
        maxlen: Stored as ``self.maxlen``.
        n_words_source: If > 0, every source dictionary entry whose index
            is ``>= n_words_source`` is removed.
        n_words_target: If > 0, every target dictionary entry whose index
            is ``>= n_words_target`` is removed.
        skip_empty: Stored as ``self.skip_empty``.
        shuffle_each_epoch: Selects the shuffled input path and is stored
            as ``self.shuffle``.
        sort_by_length: Stored as ``self.sort_by_length``.
        maxibatch_size: Multiplied by ``batch_size`` to give ``self.k``.
    """
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
    else:
        # Handles stay open for the lifetime of the iterator, so no
        # ``with`` block here.
        self.source = open(source, 'r', encoding='utf-8')
        self.target = open(target, 'r', encoding='utf-8')

    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty

    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Limit vocabulary size. Keys are snapshotted before deleting because
    # removing entries while iterating over .items() raises
    # "dictionary changed size during iteration" on Python 3. Deletion is
    # done in place so the objects returned by load_dict are preserved.
    for vocab, limit in ((self.source_dict, self.n_words_source),
                         (self.target_dict, self.n_words_target)):
        if limit > 0:
            out_of_range = [key for key, idx in vocab.items()
                            if idx >= limit]
            for key in out_of_range:
                del vocab[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    # presumably the number of pairs buffered at once -- confirm against
    # the consumer of self.k.
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
def reset(self):
    """Prepare the source stream for another pass.

    Without shuffling, the already-open stream is rewound to offset 0.
    With shuffling enabled, ``self.source`` is rebound to whatever
    ``shuffle.main([self.source_orig], temporary=True)`` returns.
    """
    if not self.shuffle:
        # Same stream is reused; just move back to the beginning.
        self.source.seek(0)
        return

    # NOTE(review): the return value is stored without unpacking; if
    # shuffle.main returns a list of handles, self.source becomes that
    # list rather than the handle itself -- confirm.
    reshuffled = shuffle.main([self.source_orig], temporary=True)
    self.source = reshuffled