def make_data(revs, word_idx_map, max_l=50, filter_h=3, val_test_splits=(2, 3), validation_num=500000):
    """
    Transforms sentences into a 2-d matrix.

    Every entry of ``revs`` becomes one row built by concatenating, in order:
    the result of ``get_idx_from_sent_msg`` on ``rev["m"]``, the result of
    ``get_idx_from_sent`` on ``rev["r"]``, the result of
    ``get_session_mask`` on ``rev["m"]``, and finally ``int(rev["y"])``.

    The first ``validation_num`` rows are placed in the validation set and
    all later rows in the training set. Nothing is ever added to the test
    set, so the third returned array is always empty.

    Args:
        revs: iterable of records indexable by the keys "m", "r" and "y".
        word_idx_map: forwarded unchanged to the two index helpers.
        max_l: forwarded unchanged to the two index helpers.
        filter_h: not used by this function; kept so existing calls work.
        val_test_splits: not used by this function; kept so existing calls
            work. The default is an immutable tuple so that it cannot be
            shared-and-mutated across calls.
        validation_num: number of leading rows reserved for validation.

    Returns:
        ``[train, val, test]``, each converted with ``np.array(..., dtype="int")``.
    """
    version = begin_time()
    train, val, test = [], [], []

    for rev in revs:
        sent = get_idx_from_sent_msg(rev["m"], word_idx_map, max_l, True)
        sent += get_idx_from_sent(rev["r"], word_idx_map, max_l, True)
        sent += get_session_mask(rev["m"])
        sent.append(int(rev["y"]))  # the label is the last column of the row

        # Validation is filled first; once it holds validation_num rows,
        # every remaining row goes to training.
        target = val if len(val) < validation_num else train
        target.append(sent)

    train = np.array(train, dtype="int")
    val = np.array(val, dtype="int")
    test = np.array(test, dtype="int")
    print('trainning data', len(train), 'val data', len(val),
          'spend time:', spend_time(version))
    return [train, val, test]
def embedding_test_master(self, input_file, embedding_file, block_size=10000):
    """
    the master of multi-threading for test by embedding model

    Steps:
      1. Load the embedding into ``self.word2vec`` and the samples into
         ``self.origin_sample`` via ``load_bigger``.
      2. Start one ``self.embedding_test_agent(start, end, block)`` thread
         per block of samples and wait for all of them.
      3. Concatenate ``self.wordresult`` in ascending key order, split the
         scores into consecutive groups of 10 and count the groups whose
         highest score is at position 0 (top-1) and the groups where
         position 0 is among the three highest scores (top-3).
      4. Print the counts, the number of groups, the elapsed time and both
         rates, then call ``end_time``.

    Args:
        input_file: path handed to ``load_bigger`` for the samples.
        embedding_file: path handed to ``load_bigger`` for the embedding.
        block_size: number of samples used to size each worker's range.

    Returns:
        None. The statistics are only printed.
    """
    version = begin_time()
    self.word2vec = load_bigger(embedding_file)
    self.origin_sample = load_bigger(input_file)

    num = len(self.origin_sample)
    workers = []
    start = 0
    end = min(block_size, num - 1)
    # NOTE(review): consecutive (start, end) pairs only tile the samples
    # without gaps if the worker treats `end` as inclusive -- confirm
    # against embedding_test_agent before changing these bounds.
    for block in range(int(num / block_size) + 1):
        workers.append(threading.Thread(
            target=self.embedding_test_agent,
            args=(start, end, block),
        ))
        start = end + 1
        end = min(num - 1, block_size * (block + 2))

    # Start every worker before joining any, so they run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # presumably embedding_test_agent stores one list of scores per block in
    # self.wordresult, keyed by the block index -- verify against the worker.
    ordered = [self.wordresult[key] for key in sorted(self.wordresult.keys())]
    # Flatten one level in linear time (sum(..., []) re-copies the
    # accumulated list on every addition).
    results = [score for chunk in ordered for score in chunk]

    totalnum = len(results)
    group_size = 10
    group_num = totalnum // group_size  # a trailing partial group is ignored
    correctnum = 0
    top3num = 0
    for index in range(group_num):
        pre = np.array(results[index * group_size:(index + 1) * group_size])
        if not pre.argmax():  # argmax == 0: position 0 has the highest score
            correctnum += 1
        if 0 in pre.argsort()[-3:][::-1]:
            top3num += 1

    # With fewer than 10 scores there is no complete group; report 0.0
    # instead of dividing by zero.
    accuracy = correctnum / group_num if group_num else 0.0
    top3_rate = top3num / group_num if group_num else 0.0
    print(correctnum, top3num, group_num, spend_time(version),
          str(accuracy)[:5], str(top3_rate)[:5])
    end_time(version)
def make_data_train(revs, word_idx_map, max_l=50, validation_num=50000, block_size=200000):
    """
    Transforms sentences into a 2-d matrix.

    Splits ``revs`` into blocks, runs ``make_data_theading`` on each block in
    its own thread, waits for all threads, then flattens the module-level
    ``trains`` and ``vals`` lists into integer numpy arrays.

    Args:
        revs: sized collection of records; passed whole to every worker.
        word_idx_map: forwarded unchanged to ``make_data_theading``.
        max_l: forwarded unchanged to ``make_data_theading``.
        validation_num: forwarded unchanged to ``make_data_theading``.
        block_size: number of records used to size each worker's range.

    Returns:
        ``[train, val, test]`` as numpy arrays; ``test`` is always empty
        because nothing in this function adds to it.
    """
    # presumably make_data_theading appends one list of rows per block to
    # these module-level lists -- verify against the worker. They are not
    # cleared here, so anything already stored in them is included too.
    global trains, vals

    version = begin_time()
    test = []

    num = len(revs)
    workers = []
    start = 0
    end = min(block_size, num - 1)
    # NOTE(review): consecutive (start, end) pairs only tile `revs` without
    # gaps if the worker treats `end` as inclusive -- confirm against
    # make_data_theading before changing these bounds.
    for block in range(int(num / block_size) + 1):
        workers.append(threading.Thread(
            target=make_data_theading,
            args=(revs, word_idx_map, max_l, validation_num, start, end),
        ))
        start = end + 1
        end = min(num - 1, block_size * (block + 2))

    # Start every worker before joining any, so they run concurrently.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Flatten one level in linear time (sum(..., []) re-copies the
    # accumulated list on every addition).
    train = [row for chunk in trains for row in chunk]
    val = [row for chunk in vals for row in chunk]

    train = np.array(train, dtype="int")
    val = np.array(val, dtype="int")
    test = np.array(test, dtype="int")
    print('trainning data', len(train), 'val data', len(val),
          'spend time:', spend_time(version))
    return [train, val, test]
def calculate_test(self, input_file, block_size=10):
    """
    calculate result

    Reads ``input_file`` (UTF-8), splits its scores into consecutive groups
    of ``block_size`` and counts the groups whose highest score is at
    position 0 (top-1) and the groups where position 0 is among the three
    highest scores (top-3). A trailing partial group is ignored.

    Args:
        input_file: path of the score file.
            NOTE(review): assumes every non-blank line holds exactly one
            numeric score -- confirm against whatever writes this file.
        block_size: number of scores per group.

    Returns:
        The top-1 rate as a string truncated to 5 characters, or "0.0" when
        the file does not contain one complete group.

    Raises:
        ValueError: if a non-blank line cannot be parsed as a float.
    """
    version = begin_time()
    with codecs.open(input_file, 'r', 'utf-8') as f:
        lines = f.readlines()

    # Parse to float so scores are ranked numerically; comparing the raw
    # text would rank them lexicographically (e.g. "9.0" above "10.0").
    results = [float(line) for line in lines if line.strip()]

    totalnum = len(results)
    group_num = totalnum // block_size
    correctnum = 0
    top3num = 0
    for index in range(group_num):
        pre = np.array(results[index * block_size:(index + 1) * block_size])
        if not pre.argmax():  # argmax == 0: position 0 has the highest score
            correctnum += 1
        if 0 in pre.argsort()[-3:][::-1]:
            top3num += 1

    # No complete group: report 0.0 instead of dividing by zero.
    accuracy = correctnum / group_num if group_num else 0.0
    top3_rate = top3num / group_num if group_num else 0.0
    print(correctnum, top3num, group_num, spend_time(version),
          str(accuracy)[:5], str(top3_rate)[:5])
    return str(accuracy)[:5]