def __init__(self, source, target, source_dict, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             shuffle_each_epoch=False, sort_by_length=True):
    """Set up a parallel-corpus iterator.

    Opens the source/target files (pre-shuffled ``.shuf`` copies when
    ``shuffle_each_epoch`` is on), loads both vocabularies and
    initialises the read-ahead buffers.
    """
    if shuffle_each_epoch:
        # shuffle.main writes '<name>.shuf' next to each input file
        shuffle.main([source, target])
        self.source = fopen(source + '.shuf', 'r')
        self.target = fopen(target + '.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    # sentences buffered ahead of batching
    self.source_buffer = []
    self.target_buffer = []
    # read 20 minibatches' worth of sentences at a time
    self.k = batch_size * 20

    self.end_of_data = False
def __init__(self, source, target, source_dicts, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True,
             indomain_source='', indomain_target='',
             interpolation_rate=0.1, maxibatch_size=20):
    """Parallel iterator mixing out-of-domain and in-domain corpora.

    ``interpolation_rate`` is the fraction of each maxibatch (of size
    ``batch_size * maxibatch_size``) drawn from the in-domain files.
    """
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
        self.indomain_source_orig = indomain_source
        self.indomain_target_orig = indomain_target
        self.indomain_source, self.indomain_target = shuffle.main(
            [self.indomain_source_orig, self.indomain_target_orig],
            temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
        self.indomain_source = fopen(indomain_source, 'r')
        self.indomain_target = fopen(indomain_target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty

    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies to the requested sizes. Iterate over a
    # snapshot: deleting from a dict while iterating its live items()
    # view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False

    self.interpolation_rate = interpolation_rate
    self.cur_interpolation_rate = self.interpolation_rate
    # split the maxibatch between in-domain and out-of-domain reads
    self.indomain_k = int(math.ceil(self.cur_interpolation_rate * self.k))
    self.outdomain_k = self.k - self.indomain_k
def __init__(self, source, target, source_dict, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             shuffle_each_epoch=False, sort_by_length=True):
    """Initialise a sentence-pair iterator over a parallel corpus.

    With ``shuffle_each_epoch`` the inputs are first shuffled on disk
    and the resulting ``.shuf`` files are read instead.
    """
    if shuffle_each_epoch:
        shuffle.main([source, target])
        self.source = fopen(source + '.shuf', 'r')
        self.target = fopen(target + '.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')

    # token -> id vocabularies
    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * 20  # read-ahead window (20 minibatches)

    self.end_of_data = False
def __init__(self, source, target, source_dict, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             shuffle_each_epoch=False):
    """Parallel-corpus iterator whose vocabularies are pickled dicts."""
    if shuffle_each_epoch:
        shuffle.main([source, target])
        self.source = fopen(source + '.shuf', 'r')
        self.target = fopen(target + '.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')

    # vocabularies are pickled token -> id mappings
    with open(source_dict, 'rb') as f:
        self.source_dict = pkl.load(f)
    with open(target_dict, 'rb') as f:
        self.target_dict = pkl.load(f)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * 20  # read-ahead window

    self.end_of_data = False
def reset(self):
    """Rewind the streams for a new epoch, reshuffling on disk first
    when per-epoch shuffling is enabled."""
    if not self.shuffle:
        self.source.seek(0)
        self.target.seek(0)
        return
    # regenerate the .shuf files from the original inputs, then reopen
    shuffle.main([self.source.name.replace('.shuf', ''),
                  self.target.name.replace('.shuf', '')])
    self.source = fopen(self.source.name, 'r')
    self.target = fopen(self.target.name, 'r')
def reset(self):
    """Rewind every source stream and the target stream for a new
    epoch, reshuffling all files together when enabled."""
    if self.shuffle:
        # shuffle all inputs with one call so lines stay aligned
        originals = [source.name.replace('.shuf', '')
                     for source in self.sources]
        originals.append(self.target.name.replace('.shuf', ''))
        shuffle.main(originals)
        self.sources = [fopen(source.name, 'r') for source in self.sources]
        self.target = fopen(self.target.name, 'r')
    else:
        for source in self.sources:
            source.seek(0)
        self.target.seek(0)
def __init__(self, source, target, source_dicts, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             shuffle_each_epoch=False, sort_by_length=True,
             indomain_source='', indomain_target='',
             interpolation_rate=0.1, maxibatch_size=20):
    """Parallel iterator mixing out-of-domain and in-domain corpora,
    reading pre-shuffled ``.shuf`` files when shuffling is enabled.

    ``interpolation_rate`` is the fraction of each maxibatch taken from
    the in-domain files.
    """
    if shuffle_each_epoch:
        shuffle.main([source, target])
        shuffle.main([indomain_source, indomain_target])
        self.source = fopen(source + '.shuf', 'r')
        self.target = fopen(target + '.shuf', 'r')
        self.indomain_source = fopen(indomain_source + '.shuf', 'r')
        self.indomain_target = fopen(indomain_target + '.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
        self.indomain_source = fopen(indomain_source, 'r')
        self.indomain_target = fopen(indomain_target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False

    self.interpolation_rate = interpolation_rate
    # split the maxibatch between in-domain and out-of-domain reads
    self.indomain_k = int(math.ceil(self.interpolation_rate * self.k))
    self.outdomain_k = self.k - self.indomain_k
def do_test(self, testNumber):
    """Run one numbered test case and compare its output to the oracle.

    Runs ``main`` on ``<testNumber>.in`` producing
    ``<testNumber>_actual.out``, then asserts that the produced lines
    equal those in ``<testNumber>.out``.
    """
    testFile = self.testDataFolder + "/" + str(testNumber)
    main(testFile + ".in", testFile + "_actual.out")
    # compare the result; 'with' guarantees the handles are closed even
    # if readlines() (or the assertion) raises
    with open(testFile + ".out", 'r') as expectedOut:
        expectedLines = expectedOut.readlines()
    with open(testFile + "_actual.out", 'r') as actualOut:
        actualLines = actualOut.readlines()
    self.assertEqual(actualLines, expectedLines)
def __init__(
    self,
    source,
    target,
    source_dicts,
    target_dict,
    batch_size=128,
    maxlen=100,
    n_words_source=-1,
    n_words_target=-1,
    shuffle_each_epoch=False,
    sort_by_length=True,
    maxibatch_size=20,
):
    """Parallel-corpus iterator whose on-disk shuffle is seeded with the
    module-level ``epoch_num`` counter."""
    global epoch_num
    if shuffle_each_epoch:
        shuffle.main([source, target], epoch_num)
        self.source = fopen(source + '.shuf', 'r')
        self.target = fopen(target + '.shuf', 'r')
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
    # NOTE(review): 'embeddings' is not a parameter of this method; it
    # must be a module-level name or this line raises NameError at
    # construction time -- confirm against the surrounding module.
    self.embeddings = embeddings
def reset(self):
    """Advance the global epoch counter and rewind (or reshuffle) the
    source/target streams for the next epoch."""
    global epoch_num
    epoch_num += 1
    if not self.shuffle:
        self.source.seek(0)
        self.target.seek(0)
        return
    # reshuffle the original files, seeded by the new epoch number
    originals = [self.source.name.replace('.shuf', ''),
                 self.target.name.replace('.shuf', '')]
    shuffle.main(originals, epoch_num)
    self.source = fopen(self.source.name, 'r')
    self.target = fopen(self.target.name, 'r')
def reset(self):
    """Start a new epoch: empty all read-ahead buffers and rewind (or
    reshuffle) both the real and the pseudo parallel streams."""
    # clear buffers for new epoch
    self.source_buffer = []
    self.target_buffer = []
    self.pseudo_source_buffer = []
    self.pseudo_target_buffer = []
    if not self.shuffle:
        for stream in (self.source, self.target,
                       self.pseudo_source, self.pseudo_target):
            stream.seek(0)
        return
    self.source, self.target = shuffle.main(
        [self.source_orig, self.target_orig], temporary=True)
    self.pseudo_source, self.pseudo_target = shuffle.main(
        [self.pseudo_source_orig, self.pseudo_target_orig], temporary=True)
def __init__(self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, shuffle_each_epoch=False, sort_by_length=True, maxibatch_size=20): if shuffle_each_epoch: shuffle.main([source, target]) self.source = fopen(source+'.shuf', 'r') self.target = fopen(target+'.shuf', 'r') else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size print "K=", self.k self.end_of_data = False
def reset(self):
    """Rewind all source streams and the target stream; with shuffling
    enabled, reshuffle every file in lockstep instead."""
    if self.shuffle:
        # the target path is appended last, so the last returned handle
        # is the target stream
        shuffled = shuffle.main(self.source_orig + [self.target_orig],
                                temporary=True)
        self.all_sources = shuffled[:-1]
        self.target = shuffled[-1]
    else:
        for stream in self.all_sources:
            stream.seek(0)
        self.target.seek(0)
def reset(self):
    """Prepare for a new epoch: reshuffle into temporary files when
    enabled, otherwise simply rewind both streams."""
    if not self.shuffle:
        self.source.seek(0)
        self.target.seek(0)
        return
    self.source, self.target = shuffle.main(
        [self.source_orig, self.target_orig], temporary=True)
def __init__(self, source, batch_size=128, max_len=100, skip_empty=False,
             shuffle_each_epoch=False, sort_by_length=False,
             max_batch_size=20, min_len=None):
    """Single-file iterator with optional per-epoch on-disk shuffling."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main(self.source_orig, temporary=True)
    else:
        self.source = fopen(source, 'r')

    # no vocabularies for this iterator
    self.source_dicts = []

    self.batch_size = batch_size
    self.max_len = max_len
    self.min_len = min_len
    self.skip_empty = skip_empty
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    # records read ahead per maxibatch
    self.k = batch_size * max_batch_size
    self.end_of_data = False
def __init__(self, sources, target, source_dicts, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=[-1], n_words_target=-1,
             shuffle_each_epoch=False, sort_by_length=True,
             maxibatch_size=20):
    """Multi-source parallel iterator with per-source factored dicts.

    ``n_words_source`` holds one vocabulary-size limit per source
    (default list is never mutated, only read).
    """
    if shuffle_each_epoch:
        shuffle.main(sources + [target])
        self.sources = [fopen(source + '.shuf', 'r') for source in sources]
        self.target = fopen(target + '.shuf', 'r')
    else:
        self.sources = [fopen(source, 'r') for source in sources]
        self.target = fopen(target, 'r')

    # one list of factor dictionaries per source file
    self.source_dicts = []
    for factor_dicts in source_dicts:
        self.source_dicts.append(
            [load_dict(source_dict) for source_dict in factor_dicts])
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    for i, n_words in enumerate(self.n_words_source):
        if n_words > 0:
            for d in self.source_dicts[i]:
                for key, idx in list(d.items()):
                    if idx >= n_words:
                        del d[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffers = [list() for _ in range(len(self.sources))]
    self.target_buffer = []
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def __init__(self, source, target, source_dicts, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True, use_factor=False,
             maxibatch_size=20, keep_data_in_memory=False):
    """Parallel iterator that can hold both corpora fully in memory
    (``keep_data_in_memory``) or stream them from disk."""
    if keep_data_in_memory:
        self.source, self.target = FileWrapper(source), FileWrapper(target)
        if shuffle_each_epoch:
            # shuffle both sides with one permutation to keep pairs aligned
            r = numpy.random.permutation(len(self.source))
            self.source.shuffle_lines(r)
            self.target.shuffle_lines(r)
    elif shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)

    self.keep_data_in_memory = keep_data_in_memory
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.use_factor = use_factor

    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
def __init__(self, source, uid_voc, mid_voc, cat_voc, batch_size=128,
             max_len=100, skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True, max_batch_size=20, min_len=None):
    """Taobao behaviour-log iterator over user/item/category vocabularies."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main(self.source_orig, temporary=True)
    else:
        self.source = fopen(source, 'r')
    self.source_dicts = [load_dict(voc)
                         for voc in [uid_voc, mid_voc, cat_voc]]

    # map from raw item id to raw category id (first occurrence wins)
    f_meta = open(
        "../taobao_data_process/central_taobao_data/taobao-item-info", "r")
    meta_map = {}
    for line in f_meta:
        fields = line.strip().split("\t")
        if fields[0] not in meta_map:
            meta_map[fields[0]] = fields[1]

    # map from vocab-remapped item id to vocab-remapped category id
    # (index 0 stands for out-of-vocabulary on either side)
    self.meta_id_map = {}
    for raw_mid in meta_map:
        raw_cat = meta_map[raw_mid]
        mid_idx = self.source_dicts[1].get(raw_mid, 0)
        cat_idx = self.source_dicts[2].get(raw_cat, 0)
        self.meta_id_map[mid_idx] = cat_idx

    self.batch_size = batch_size
    self.max_len = max_len
    self.min_len = min_len
    self.skip_empty = skip_empty

    # vocabulary sizes
    self.n_uid = len(self.source_dicts[0])
    self.n_mid = len(self.source_dicts[1])
    self.n_cat = len(self.source_dicts[2])

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.k = batch_size * max_batch_size
    self.end_of_data = False
def reset(self, x):
    """Rewind (or reshuffle) the x-th source/target pair, mark that
    pair as finished and stop the iterator."""
    if self.shuffle:
        self.sources[x], self.targets[x] = shuffle.main(
            [self.sources_orig[x], self.targets_orig[x]], temporary=True)
    else:
        self.sources[x].seek(0)
        self.targets[x].seek(0)
    self.finished_files[x] = True
    self.stop()
def __init__(self, source, target, source_dicts, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True, use_factor=False,
             maxibatch_size=20, token_batch_size=0):
    """Parallel iterator with optional token-count-based batching
    (``token_batch_size`` > 0 overrides sentence-count batching)."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.use_factor = use_factor

    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size
    self.token_batch_size = token_batch_size

    self.end_of_data = False
def reset(self):
    """Rewind source/target/log (and the optional external-reward
    stream) for a new epoch, reshuffling them together when enabled."""
    if self.shuffle:
        # include the reward file in the joint shuffle only when present
        if self.external_reward:
            (self.source, self.target, self.log,
             self.external_reward) = shuffle.main(
                [self.source_orig, self.target_orig, self.log_orig,
                 self.external_reward_ori], temporary=True)
        else:
            self.source, self.target, self.log = shuffle.main(
                [self.source_orig, self.target_orig, self.log_orig],
                temporary=True)
    else:
        for stream in (self.source, self.target, self.log):
            stream.seek(0)
        if self.external_reward:
            self.external_reward.seek(0)
def __init__(self, source, target, source_dict, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True, maxibatch_size=20):
    """Parallel-corpus iterator with single (non-factored) vocabularies."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty

    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies to the requested sizes. Snapshot items()
    # first: deleting from a dict while iterating its live view raises
    # RuntimeError on Python 3.
    if self.n_words_source > 0:  # if source number is specified
        for key, idx in list(self.source_dict.items()):
            if idx >= self.n_words_source:
                del self.source_dict[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []  # source instances held in memory
    self.target_buffer = []  # target instances held in memory
    self.k = batch_size * maxibatch_size  # total instances buffered

    self.end_of_data = False
def reset(self):
    """Start a new epoch: reshuffle (in memory or on disk depending on
    ``keep_data_in_memory``) or simply rewind both streams."""
    if not self.shuffle:
        self.source.seek(0)
        self.target.seek(0)
        return
    if self.keep_data_in_memory:
        # permute both sides identically so pairs stay aligned
        r = numpy.random.permutation(len(self.source))
        self.source.shuffle_lines(r)
        self.target.shuffle_lines(r)
    else:
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
def reset(self):
    """Rewind both sources, the target, and the optional alignment
    streams; reshuffle all files jointly when enabled."""
    if self.shuffle:
        # alignment files are included in the joint shuffle only when present
        if self.align1:
            (self.source1, self.source2, self.target,
             self.align1, self.align2) = shuffle.main(
                [self.source_orig1, self.source_orig2, self.target_orig,
                 self.align_orig1, self.align_orig2], temporary=True)
        else:
            self.source1, self.source2, self.target = shuffle.main(
                [self.source_orig1, self.source_orig2, self.target_orig],
                temporary=True)
    else:
        for stream in (self.source1, self.source2, self.target):
            stream.seek(0)
        if self.align1:
            self.align1.seek(0)
            self.align2.seek(0)
def __init__(self, source, target, source_dict, target_dict,
             batch_size=128, maxlen=None,
             n_words_source=-1, n_words_target=-1,
             skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True, maxibatch_size=20):
    """Parallel-corpus iterator; ``maxlen=None`` disables the length cap."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = data_utils.fopen(source, 'r')
        self.target = data_utils.fopen(target, 'r')
    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty

    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for key, idx in list(self.source_dict.items()):
            if idx >= self.n_words_source:
                del self.source_dict[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
def __init__(self, datasets, dicts, n_words_dicts=None,
             batch_size=128, maxlen=100, skip_empty=False,
             shuffle_each_epoch=False, sort_by_length=True,
             factors=1, outputs=1, maxibatch_size=20):
    """Generic multi-stream iterator: ``datasets[0]`` is the (factored)
    input file, ``datasets[1:]`` are the ``outputs`` target files."""
    if shuffle_each_epoch:
        self.datasets_orig = datasets
        self.datasets = shuffle.main(datasets, temporary=True)
    else:
        self.datasets = [fopen(fp, 'r') for fp in datasets]
    self.dicts = []
    for dict_ in dicts:
        self.dicts.append(load_dict(dict_))

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.factors = factors
    self.outputs = outputs
    assert len(
        datasets) == 1 + outputs, 'Datasets and dictionaries mismatch'
    self.n_words_dicts = n_words_dicts

    # Truncate each vocabulary to its limit. Snapshot items() first:
    # deleting from a dict while iterating its live view raises
    # RuntimeError on Python 3.
    if self.n_words_dicts:
        for d, max_ in zip(self.dicts, self.n_words_dicts):
            for key, idx in list(d.items()):
                if idx >= max_:
                    del d[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    # one read-ahead buffer per stream
    self.buffers = [[] for _ in range(len(datasets))]
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def __init__(self, source, source_dicts, batch_size=128, maxlen=100,
             n_words_source=-1, skip_empty=False,
             shuffle_each_epoch=False, sort_by_length=True,
             maxibatch_size=20):
    """Source-only iterator with factored dictionaries."""
    if shuffle_each_epoch:
        self.source_orig = source
        # shuffle.main returns one shuffled file object per input path;
        # unpack the single entry explicitly (removes the old debug
        # print and '???' marker)
        [self.source] = shuffle.main([self.source_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty

    self.n_words_source = n_words_source

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.k = batch_size * maxibatch_size
    self.end_of_data = False
def __init__(self, source, uid_voc, iid_voc, cat_voc, brand_voc,
             batch_size=128, maxlen=100, skip_empty=False,
             shuffle_each_epoch=False, sort_by_length=True,
             max_batch_size=20, minlen=1):
    """Behaviour-log iterator over user/item/category/brand vocabularies."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main(self.source_orig, temporary=True)
    else:
        self.source = fopen(source, 'r')
    self.source_dicts = [load_dict(voc)
                         for voc in [uid_voc, iid_voc, cat_voc, brand_voc]]

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.minlen = minlen
    self.skip_empty = skip_empty

    # vocabulary sizes
    self.n_uid = len(self.source_dicts[0])
    self.n_iid = len(self.source_dicts[1])
    self.n_cat = len(self.source_dicts[2])
    self.n_brand = len(self.source_dicts[3])

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.k = batch_size * max_batch_size
    self.end_of_data = False
def reset(self):
    """Rewind the source stream, reshuffling it on disk when enabled."""
    if not self.shuffle:
        self.source.seek(0)
        return
    self.source = shuffle.main(self.source_orig, temporary=True)
def __init__(self, source, target, source_dicts, target_dict, model_type,
             batch_size=128, maxlen=100, source_vocab_sizes=None,
             target_vocab_size=None, skip_empty=False,
             shuffle_each_epoch=False, sort_by_length=True,
             use_factor=False, maxibatch_size=20, token_batch_size=0,
             keep_data_in_memory=False):
    """Parallel iterator supporting in-memory data, per-factor vocab
    limits, and model-type-aware dictionary loading."""
    if keep_data_in_memory:
        self.source, self.target = FileWrapper(source), FileWrapper(target)
        if shuffle_each_epoch:
            # one permutation for both sides keeps sentence pairs aligned
            r = numpy.random.permutation(len(self.source))
            self.source.shuffle_lines(r)
            self.target.shuffle_lines(r)
    elif shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main(
            [self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')
    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict, model_type))
    self.target_dict = load_dict(target_dict, model_type)

    # Determine the UNK value for each dictionary (the value depends on
    # which version of build_dictionary.py was used).
    def determine_unk_val(d):
        # newer dictionaries map '<UNK>' to 2; older ones reserve 1
        return 2 if d.get('<UNK>') == 2 else 1

    self.source_unk_vals = [determine_unk_val(d) for d in self.source_dicts]
    self.target_unk_val = determine_unk_val(self.target_dict)

    self.keep_data_in_memory = keep_data_in_memory
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.use_factor = use_factor

    self.source_vocab_sizes = source_vocab_sizes
    self.target_vocab_size = target_vocab_size
    self.token_batch_size = token_batch_size

    # truncate each vocabulary to its configured size (iterating over a
    # snapshot so deletion is safe)
    if self.source_vocab_sizes != None:
        assert len(self.source_vocab_sizes) == len(self.source_dicts)
        for d, vocab_size in zip(self.source_dicts, self.source_vocab_sizes):
            if vocab_size != None and vocab_size > 0:
                for key, idx in list(d.items()):
                    if idx >= vocab_size:
                        del d[key]
    if self.target_vocab_size != None and self.target_vocab_size > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.target_vocab_size:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
def __init__(self, source, uid_voc, mid_voc, cat_voc, item_info,
             reviews_info, batch_size=128, maxlen=100, skip_empty=False,
             shuffle_each_epoch=False, sort_by_length=True,
             max_batch_size=20):
    """Review-log iterator; builds an item->category map from
    ``item_info`` and a negative-sampling pool from ``reviews_info``."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main(self.source_orig, temporary=True)
    else:
        self.source = fopen(source, 'r')
    self.source_dicts = [load_dict(voc)
                         for voc in [uid_voc, mid_voc, cat_voc]]

    # raw item id -> raw category id (first occurrence wins)
    f_meta = fopen(item_info, "r")
    meta_map = {}
    for line in f_meta:
        fields = line.strip().split("\t")
        if fields[0] not in meta_map:
            meta_map[fields[0]] = fields[1]

    # vocab-remapped item id -> vocab-remapped category id (0 = OOV)
    self.meta_id_map = {}
    for raw_mid in meta_map:
        raw_cat = meta_map[raw_mid]
        mid_idx = self.source_dicts[1].get(raw_mid, 0)
        cat_idx = self.source_dicts[2].get(raw_cat, 0)
        self.meta_id_map[mid_idx] = cat_idx

    # pool of item ids (vocab indices) used for random negative sampling
    f_review = fopen(reviews_info, "r")
    self.mid_list_for_random = []
    for line in f_review:
        fields = line.strip().split("\t")
        self.mid_list_for_random.append(
            self.source_dicts[1].get(fields[1], 0))

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty

    self.n_uid = len(self.source_dicts[0])
    self.n_mid = len(self.source_dicts[1])
    self.n_cat = len(self.source_dicts[2])

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.k = batch_size * max_batch_size
    self.end_of_data = False
def __init__(self, source, NUM_FEATURE, NUM_QUERY, voc_list,
             batch_size=128, maxlen=100, skip_empty=False,
             shuffle_each_epoch=False, sort_by_length=True,
             max_batch_size=20, minlen=None):
    # Iterator over a behaviour log with NUM_QUERY query vocabularies
    # followed by item/feature vocabularies in voc_list; builds an
    # item -> feature-indices map from "item-info" and a negative-
    # sampling pool from "reviews-info".
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main(self.source_orig, temporary=True)
    else:
        self.source = fopen(source, 'r')
    self.source_dicts = []
    for source_dict in voc_list:
        self.source_dicts.append(load_dict(source_dict))
    self.num_feature = NUM_FEATURE
    self.num_query = NUM_QUERY
    f_meta = open("item-info", "r")
    meta_map = {}
    for line in f_meta:
        arr = line.strip().split("\t")
        if arr[0] not in meta_map:
            # NOTE(review): this slice keeps columns 1..num_feature-1,
            # i.e. num_feature-1 values; if one value per feature is
            # intended this looks like an off-by-one -- confirm the
            # "item-info" file layout.
            meta_map[arr[0]] = arr[1:self.num_feature]
    self.meta_id_map = {}
    for key in meta_map:
        # source_dicts[num_query] is the item vocabulary; 0 marks
        # out-of-vocabulary items
        if key in self.source_dicts[self.num_query]:
            mid_idx = self.source_dicts[self.num_query][key]
        else:
            mid_idx = 0
        val = []
        for i in range(len(meta_map[key])):
            idx = 0
            cur_val = meta_map[key][i]
            # feature i is looked up in the dict offset past the query
            # dicts and the item dict
            if (cur_val in self.source_dicts[i + self.num_query + 1]):
                idx = self.source_dicts[i + self.num_query + 1][cur_val]
            val.append(idx)
        self.meta_id_map[mid_idx] = val
    # pool of item indices for random negative sampling
    f_review = open("reviews-info", "r")
    self.mid_list_for_random = []
    for line in f_review:
        arr = line.strip().split("\t")
        tmp_idx = 0
        if arr[1] in self.source_dicts[self.num_query]:
            tmp_idx = self.source_dicts[self.num_query][arr[1]]
        self.mid_list_for_random.append(tmp_idx)
    self.batch_size = batch_size
    self.maxlen = maxlen
    self.minlen = minlen
    self.skip_empty = skip_empty
    # vocabulary sizes: one entry per query dict, then per feature dict
    self.n_query = []
    for i in range(self.num_query):
        self.n_query.append(len(self.source_dicts[i]))
    self.n = []
    for i in range(self.num_feature):
        self.n.append(len(self.source_dicts[i + self.num_query]))
    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length
    self.source_buffer = []
    self.k = batch_size * max_batch_size
    self.end_of_data = False
def __init__(self, source, uid_voc, mid_voc, cat_voc, batch_size=128,
             maxlen=100, skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True, max_batch_size=20, minlen=None,
             label_type=1):
    """Review-log iterator with extra Cartesian-product ("carte")
    vocabularies loaded from fixed pickle files."""
    if shuffle_each_epoch:
        self.source_orig = source
        self.source = shuffle.main(self.source_orig, temporary=True)
    else:
        self.source = fopen(source, 'r')
    # base vocabularies plus the two cross-feature vocabularies
    self.source_dicts = [load_dict(voc) for voc in [
        uid_voc, mid_voc, cat_voc,
        'item_carte_voc.pkl', 'cate_carte_voc.pkl'
    ]]

    # raw item id -> raw category id (first occurrence wins)
    f_meta = open("item-info", "r")
    meta_map = {}
    for line in f_meta:
        fields = line.strip().split("\t")
        if fields[0] not in meta_map:
            meta_map[fields[0]] = fields[1]

    # vocab-remapped item id -> vocab-remapped category id (0 = OOV)
    self.meta_id_map = {}
    for raw_mid in meta_map:
        raw_cat = meta_map[raw_mid]
        mid_idx = self.source_dicts[1].get(raw_mid, 0)
        cat_idx = self.source_dicts[2].get(raw_cat, 0)
        self.meta_id_map[mid_idx] = cat_idx

    # pool of item indices for random negative sampling
    f_review = open("reviews-info", "r")
    self.mid_list_for_random = []
    for line in f_review:
        fields = line.strip().split("\t")
        self.mid_list_for_random.append(
            self.source_dicts[1].get(fields[1], 0))

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.minlen = minlen
    self.skip_empty = skip_empty

    self.n_uid = len(self.source_dicts[0])
    self.n_mid = len(self.source_dicts[1])
    self.n_cat = len(self.source_dicts[2])
    self.n_carte = [len(self.source_dicts[3]), len(self.source_dicts[4])]
    print("n_uid=%d, n_mid=%d, n_cat=%d" %
          (self.n_uid, self.n_mid, self.n_cat))

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.k = batch_size * max_batch_size
    self.end_of_data = False
    self.label_type = label_type
def reset(self):
    """Rewind the source stream for a new epoch, reshuffling on disk
    when per-epoch shuffling is enabled."""
    if not self.shuffle:
        self.source.seek(0)
        return
    self.source = shuffle.main(self.source_orig, temporary=True)
def __init__(self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, use_factor=False, maxibatch_size=20): if shuffle_each_epoch: self.source_orig = source self.target_orig = target self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') print 'scan the dataset.' for si, _ in enumerate(self.source): pass for ti, _ in enumerate(self.target): pass self.source.seek(0) self.target.seek(0) assert si == ti, 'the number of the source and target document must the same' print 'scanned {} lines'.format(si) self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.use_factor = use_factor self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False
def __init__(self, source, target, source_dicts, target_dict,
             batch_size=128, maxlen=100,
             n_words_source=-1, n_words_target=-1,
             skip_empty=False, shuffle_each_epoch=False,
             sort_by_length=True, use_factor=False, maxibatch_size=20,
             align1_file=None, align2_file=None):
    """Two-source parallel iterator with optional word-alignment files.

    ``source`` is a comma-separated pair of file paths; ``align1_file``
    and ``align2_file`` (both or neither) supply alignments for the two
    source files.
    """
    self.source_files = source.split(",")
    if shuffle_each_epoch:
        self.source_orig1 = self.source_files[0]
        self.source_orig2 = self.source_files[1]
        self.target_orig = target
        if align1_file:
            self.align_orig1 = align1_file
            self.align_orig2 = align2_file
            (self.source1, self.source2, self.target,
             self.align1, self.align2) = shuffle.main(
                [self.source_orig1, self.source_orig2, self.target_orig,
                 self.align_orig1, self.align_orig2], temporary=True)
        else:
            self.source1, self.source2, self.target = shuffle.main(
                [self.source_orig1, self.source_orig2, self.target_orig],
                temporary=True)
            self.align1 = None
    else:
        self.source1 = fopen(self.source_files[0], 'r')
        self.source2 = fopen(self.source_files[1], 'r')
        self.target = fopen(target, 'r')
        if align1_file:
            self.align1 = fopen(align1_file, 'r')
            self.align2 = fopen(align2_file, 'r')
        else:
            self.align1 = None

    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.use_factor = use_factor

    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    # Truncate vocabularies. Snapshot items() first: deleting from a
    # dict while iterating its live view raises RuntimeError on Python 3.
    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in list(d.items()):
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in list(self.target_dict.items()):
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source1_buffer = []
    self.source2_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size
    if self.align1:
        self.align1_buffer = []
        self.align2_buffer = []
    # (removed a dead triple-quoted block of Python-2 debugging code
    # that counted lines in the two source files)
    self.end_of_data = False