def load_dataset(self, dataset_directory, tokenizer_type):
    input_lines = utils.read_lines('%s/%s' % (dataset_directory, 'inputs.txt'))
    label_lines = utils.read_lines('%s/%s' % (dataset_directory, 'labels.txt'))
    input_tokens = list(
        map(
            lambda l: tokenizer.tokenize_sentence(
                l,
                vocab=self.vocab,
                tokenizer_type=tokenizer_type,
                add_start_end=False), input_lines))
    label_tokens = list(
        map(
            lambda l: tokenizer.tokenize_sentence(
                l,
                vocab=self.label_vocab,
                tokenizer_type=tokenizer_type,
                add_start_end=False), label_lines))
    dataset = {}
    dataset['input_tokens'] = input_tokens
    dataset['label_tokens'] = label_tokens
    dataset['input_max_length'] = np.max(
        list(map(lambda l: len(l), input_tokens)))
    dataset['inputs'] = input_lines
    dataset['labels'] = label_lines
    dataset['size'] = len(input_lines)
    return dataset
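# A minimal, self-contained sketch (not project code) of the pattern used in
# load_dataset above: tokenize every input line, then record the longest token
# sequence, which a caller would typically use for padding.
lines = ["where is the cat ?", "who wrote it ?"]
tokens = [l.split(" ") for l in lines]  # stand-in for tokenizer.tokenize_sentence
max_length = max(len(t) for t in tokens)
assert max_length == 5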
def read_raw_lines(self, base_path):
    inputs_path = '%s/inputs.txt' % base_path
    outputs_path = '%s/outputs.txt' % base_path
    inputs = utils.read_lines(inputs_path)
    outputs = utils.read_lines(outputs_path)
    dataset = {}
    dataset['inputs'] = inputs
    dataset['outputs'] = outputs
    return dataset
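# Hedged sketch of the assumed utils.read_lines contract (the real helper is
# not shown in this file): read a UTF-8 text file and return its lines with
# trailing newlines stripped.
def read_lines_sketch(path):
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]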
def save_results(dev_path, shared_path, gen_questions_path,
                 gen_answers_start_path, gen_answers_end_path, gen_idxs_path,
                 gen_ids_path, save_path):
    print("Loading dev json: %s and shared: %s" % (dev_path, shared_path))
    dev_json = json.load(open(dev_path))
    shared_json = json.load(open(shared_path))
    print("Done loading dev json and shared")
    questions = utils.read_lines(gen_questions_path)
    answer_starts = utils.read_lines(gen_answers_start_path)
    answer_ends = utils.read_lines(gen_answers_end_path)
    idxs = utils.read_lines(gen_idxs_path)
    ids = utils.read_lines(gen_ids_path)
    keys = dev_json.keys()
    dataset = defaultdict(list)
    for i in range(0, len(questions)):
        cur_q = questions[i].split(" ")
        # Skip malformed questions and drop repeated tokens.
        if invalid_question(cur_q):
            continue
        cur_q = dedup(cur_q)
        cur_ans_start = int(answer_starts[i])
        cur_ans_end = int(answer_ends[i])
        idx = int(idxs[i])
        id = int(ids[i])
        cur_par = shared_json['x'][idx][0][0]
        cy_0 = 0
        cy_1 = len(cur_par[cur_ans_end - 1])
        cy = [[cy_0, cy_1]]
        answerss = [cur_par[cur_ans_start:cur_ans_end]]
        cur_q_char = list(map(lambda token: token.split(), cur_q))
        # Accumulate one entry per kept example in parallel, column-wise lists.
        dataset['idxs'].append(idx)
        dataset['ids'].append(len(dataset['ids']))
        dataset['cy'].append(cy)
        dataset['answerss'].append(answerss)
        dataset['span_answerss'].append(answerss)
        dataset['*x'].append([idx, 0])
        dataset['*cx'].append([idx, 0])
        dataset['*p'].append([idx, 0])
        dataset['y'].append([[[0, cur_ans_start], [0, cur_ans_end]]])
        dataset['q'].append(cur_q)
        dataset['cq'].append(cur_q_char)
    print("Saving to path %s" % save_path)
    utils.save_json(dataset, save_path)
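# Illustration (not project code) of the defaultdict(list) pattern used in
# save_results above: every field accumulates one entry per kept example, so
# all columns stay aligned.
from collections import defaultdict

rows = [("what is x ?", 3), ("who wrote y ?", 7)]
columns = defaultdict(list)
for question, paragraph_idx in rows:
    columns['q'].append(question.split(" "))
    columns['idxs'].append(paragraph_idx)
assert len(columns['q']) == len(columns['idxs']) == 2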
def init_from_path(self, path):
    """Read lines (one vocab token per line) into counters."""
    self.path = path
    lines = utils.read_lines(path)
    self.init_from_array(lines)
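# Hedged sketch of what init_from_array presumably does with the vocab lines
# (an assumption; its body is not shown here): count token occurrences.
from collections import Counter

vocab_lines = ["the", "cat", "the"]
token_counts = Counter(vocab_lines)
assert token_counts["the"] == 2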
def load_dataset(self, dataset_directory, tokenizer_type):
    input_lines = utils.read_lines('%s/%s' % (dataset_directory, 'inputs.txt'))
    output_lines = utils.read_lines('%s/%s' % (dataset_directory, 'outputs.txt'))
    indices_lines = utils.read_lines('%s/%s' % (dataset_directory, 'indices.txt'))
    context_tokens = list(
        map(
            lambda l: tokenizer.tokenize_sentence(
                l,
                vocab=self.vocab,
                tokenizer_type=self.context_tokenizer_type,
                add_start_end=True), input_lines))
    output_tokens = list(
        map(
            lambda l: tokenizer.tokenize_sentence(
                l,
                vocab=self.vocab,
                tokenizer_type=tokenizer_type,
                add_start_end=True), output_lines))
    # Shift by one position: the decoder reads tokens[:-1] and is trained to
    # predict tokens[1:].
    input_tokens = list(map(lambda tokens_list: tokens_list[0:-1], output_tokens))
    desired_input_tokens = list(map(lambda tokens_list: tokens_list[1:], output_tokens))
    indices_vals = list(map(lambda l: int(l), indices_lines))
    answer_starts_path = '%s/%s' % (dataset_directory, 'answer_starts.txt')
    answer_ends_path = '%s/%s' % (dataset_directory, 'answer_ends.txt')
    dataset = {}
    if utils.check_file(answer_starts_path):
        answer_starts = utils.read_lines(answer_starts_path)
        answer_starts = list(map(lambda l: int(l), answer_starts))
        dataset['answer_starts'] = answer_starts
    if utils.check_file(answer_ends_path):
        answer_ends = utils.read_lines(answer_ends_path)
        answer_ends = list(map(lambda l: int(l), answer_ends))
        dataset['answer_ends'] = answer_ends
    print("Example context length: %s" % len(context_tokens[2]))
    dataset['indices'] = indices_vals
    dataset['input_tokens'] = input_tokens
    dataset['desired_input_tokens'] = desired_input_tokens
    dataset['context_tokens'] = context_tokens
    dataset['contexts'] = input_lines
    dataset['inputs'] = output_lines
    dataset['desired_inputs'] = output_lines
    dataset['size'] = len(output_lines)
    return dataset
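# Self-contained illustration (not project code) of the one-token shift used
# above: each decoder input position is trained to predict the next token.
output_tokens = ["<START>", "where", "is", "it", "?", "<END>"]
decoder_inputs = output_tokens[:-1]
decoder_targets = output_tokens[1:]
assert list(zip(decoder_inputs, decoder_targets))[0] == ("<START>", "where")
assert list(zip(decoder_inputs, decoder_targets))[-1] == ("?", "<END>")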
def load_dataset(self, dataset_directory, tokenizer_type):
    input_lines = utils.read_lines('%s/%s' % (dataset_directory, 'inputs.txt'))
    output_lines = utils.read_lines('%s/%s' % (dataset_directory, 'outputs.txt'))
    indices_lines = utils.read_lines('%s/%s' % (dataset_directory, 'indices.txt'))
    context_tokens = list(
        map(
            lambda l: tokenizer.tokenize_sentence(
                l,
                vocab=self.vocab,
                tokenizer_type=constants.TOKENIZER_SPECIAL_DELIMITER,
                add_start_end=True), input_lines))
    output_tokens = list(
        map(
            lambda l: tokenizer.tokenize_sentence(
                l,
                vocab=self.vocab,
                tokenizer_type=tokenizer_type,
                add_start_end=True), output_lines))
    input_tokens = list(map(lambda tokens_list: tokens_list[0:-1], output_tokens))
    desired_input_tokens = list(map(lambda tokens_list: tokens_list[1:], output_tokens))
    indices_vals = list(map(lambda l: int(l), indices_lines))
    answer_starts_path = '%s/%s' % (dataset_directory, 'answer_starts.txt')
    answer_ends_path = '%s/%s' % (dataset_directory, 'answer_ends.txt')
    dataset = {}
    if utils.check_file(answer_starts_path):
        answer_starts = utils.read_lines(answer_starts_path)
        # Shift by one for the prepended <START> token.
        answer_starts = list(map(lambda l: int(l) + 1, answer_starts))
        dataset['answer_starts'] = answer_starts
    if utils.check_file(answer_ends_path):
        answer_ends = utils.read_lines(answer_ends_path)
        # Shift by one for the prepended <START> token.
        answer_ends = list(map(lambda l: int(l) + 1, answer_ends))
        dataset['answer_ends'] = answer_ends
    dataset['indices'] = indices_vals
    dataset['input_tokens'] = input_tokens
    dataset['desired_input_tokens'] = desired_input_tokens
    dataset['context_tokens'] = context_tokens
    dataset['contexts'] = input_lines
    dataset['inputs'] = output_lines
    dataset['desired_inputs'] = output_lines
    dataset['size'] = len(output_lines)
    return dataset
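# Self-contained illustration (not project code) of why the answer offsets are
# shifted by one above: prepending <START> moves every context token one
# position to the right, so the stored spans must move with them.
context = ["the", "cat", "sat"]
answer_start, answer_end = 1, 3                      # span "cat sat"
wrapped = ["<START>"] + context + ["<END>"]
assert wrapped[answer_start + 1:answer_end + 1] == context[answer_start:answer_end]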
def func(l):
    items = l.strip().split(" ")
    return items


answer_starts_path = 'datasets/newsqa/train/answer_starts.txt'
answer_ends_path = 'datasets/newsqa/train/answer_ends.txt'
input_path = 'datasets/newsqa/train/inputs.txt'
output_path = 'datasets/newsqa/train/outputs.txt'
generated_path = 'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt'
indices_path = 'datasets/newsqa/train/indices.txt'

inputs = utils.read_lines_with_func(func, input_path)
outputs = utils.read_tabbed_lines(output_path)
generated = utils.read_lines_with_func(gen_func, generated_path)
answer_starts = list(
    map(lambda l: int(l), utils.read_lines(answer_starts_path)))
answer_ends = list(map(lambda l: int(l), utils.read_lines(answer_ends_path)))
indices = list(map(lambda l: int(l), utils.read_lines(indices_path)))

answers = []
truncated_contexts = []
questions = []
generated_questions = []
num_overlap = []
num_items = len(generated)
question_counter = 0
generated_question_counter = 0
filtered_words = ["a", "the", "who", "what", "when", "where", "why", "it"]

for i in range(num_items):