def run_fusion_extractive_abstractive(extractive_output, abstractive_output,
                                      ext_abs_fusion_outfile, ext_abs_fusion_config):
    ext_abs_data_raw, metadata = load_ext_abs_score_data(extractive_output,
                                                         abstractive_output)
    ext_abs_data = torch.FloatTensor(ext_abs_data_raw)
    parameters = {
        k: torch.FloatTensor(v)
        for k, v in ext_abs_fusion_config["parameters"].items()
    }
    ext_abs_logits = F.linear(ext_abs_data, **parameters).squeeze(-1).tolist()
    with jsonlines.open(extractive_output, "r") as reader_outputs, \
            jsonlines.open(ext_abs_fusion_outfile, "w") as ofwriter:
        for e, q, logit, proposed_answers, scores, best_span_index in zip(
                reader_outputs, metadata["questions"], ext_abs_logits,
                metadata['proposed_answers'], ext_abs_data_raw,
                metadata['ext_best_span_idx']):
            assert q == e["raw_question"]
            assert proposed_answers[0] == e["answers"][best_span_index]
            # 0 is the extractive class, 1 is the abstractive class
            decision = int(logit > 0)
            e["reader_scores"] = [scores[decision]]
            e["answers"] = [proposed_answers[decision]]
            if decision:
                # abstractive answers carry no span info
                del e["passages"]
                del e["char_offsets"]
            else:
                e["passages"] = [e["passages"][best_span_index]]
                e["char_offsets"] = [e["char_offsets"][best_span_index]]
            ofwriter.write(e)

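# A minimal sketch of the fusion step above: the config is assumed to hold the
# weight/bias of a single-output linear layer over per-question score pairs;
# the feature width of 2 (extractive score, abstractive score) and all numbers
# are illustrative assumptions, not values from the repo.
import torch
import torch.nn.functional as F

example_config = {
    "parameters": {
        "weight": [[0.7, -0.4]],  # shape (1, 2): one logit from two scores
        "bias": [0.1],
    }
}
scores = torch.FloatTensor([[3.2, 1.5], [0.4, 2.8]])  # (n_questions, 2)
params = {k: torch.FloatTensor(v) for k, v in example_config["parameters"].items()}
logits = F.linear(scores, **params).squeeze(-1)  # logit > 0 -> abstractive wins
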
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("host", help="Elasticsearch host.")
    parser.add_argument("-p", "--port", default=9200, type=int,
                        help="port, default is 9200")
    parser.add_argument("--question_paths", nargs="+", type=str, required=True,
                        help="Path to the questions.")
    args = parser.parse_args()
    host = args.host
    port = args.port
    for path in args.question_paths:
        updated_questions = list()
        resolved_contexts = []
        raw_contexts = []
        f_context = partial(query_es_bulk, host=host, port=port)
        with jsonlines.open(path) as reader:
            questions = list(reader)
        for result in tqdm(Pool().imap(f_context, questions), total=len(questions)):
            resolved_contexts.append(result)
        for question, per_choice_contexts in zip(questions, resolved_contexts):
            raw_context = {}
            q_updated = question.copy()
            for choice, contexts in zip(question["question"]["choices"],
                                        per_choice_contexts):
                choice["para"] = " ".join([c["hit"]["text"] for c in contexts])
                raw_context[choice["label"]] = contexts
            updated_questions.append(q_updated)
            raw_contexts.append({question["id"]: raw_context})
        base_dir = os.path.dirname(path)
        base = os.path.basename(path)
        name = os.path.splitext(base)[0]
        with jsonlines.open(os.path.join(base_dir, name + "_with_hits.jsonl"), "w") as writer:
            writer.write_all(raw_contexts)
        with jsonlines.open(os.path.join(base_dir, name + "_with_para.jsonl"), "w") as writer:
            writer.write_all(updated_questions)

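# Hypothetical invocation of the CLI above (the script name, host, and paths
# are placeholders, not values from the repo):
#   python fetch_es_contexts.py localhost -p 9200 \
#       --question_paths data/dev.jsonl data/train.jsonl
# For each input file this writes <name>_with_hits.jsonl (raw ES hits per
# choice) and <name>_with_para.jsonl (questions with a "para" string per choice).
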
def div_raw(rot_seq: Sequence[List[List[object]]], prob_train: float,
            train_fn: str, dev_fn: str):
    with jsonlines.open(train_fn, mode='w') as writer_t, \
            jsonlines.open(dev_fn, mode='w') as writer_d:
        for s in rot_seq:
            # send a sample to the train split with probability prob_train
            # (the original comparison was inverted relative to the name)
            if random.random() < prob_train:
                writer_t.write(s)
            else:
                writer_d.write(s)

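# Example use of div_raw: a ~90/10 train/dev split. Seeding is only for a
# reproducible sketch; the nested-list shape mirrors the type annotation.
import random

random.seed(0)
rows = [[["token", "tag"]] for _ in range(100)]
div_raw(rows, prob_train=0.9, train_fn="train.jsonl", dev_fn="dev.jsonl")
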
def extract_dataset(infile, outfile, negative_X=1, golden_passages=None):
    # relies on the module-level para_db (passage database) and total_length
    # (the number of passages in that database)
    if not golden_passages:
        golden_passages = get_golden_passages(infile)  # too slow
    print("Precomputing negatives...\n")
    all_negative_passages = list(set(range(total_length)) - golden_passages)

    def get_random_negative():
        return random.choice(all_negative_passages)

    print("Constructing dataset...\n")
    total_processed_examples = 0
    total_created_examples = 0
    with jsonlines.open(outfile, mode='w') as writer:
        for example_idx, example in enumerate(jsonlines.open(infile)):
            if not example['is_mapped'] or \
                    example['contexts']['positive_ctx'] not in golden_passages:
                continue
            total_processed_examples += 1
            # write the positive passage (label 0)
            title, passage = para_db.get_doc_text(
                example['contexts']['positive_ctx'],
                ['raw_document_title', 'raw_paragraph_context'])
            total_created_examples += 1
            writer.write({
                "id": example['example_id'],
                "title": title,
                "psg": passage,
                "label": 0,
            })
            # assert example['title'] == title, f"Titles do not match {example['title']} /=/ {title}"
            # write negative_X random negatives (label 1)
            raw_negatives_ids = [get_random_negative() for _ in range(negative_X)]
            raw_negative_titles, raw_negatives = [], []
            for negative_id in raw_negatives_ids:
                title, text = para_db.get_doc_text(
                    negative_id, columns=["raw_document_title", "raw_paragraph_context"])
                raw_negative_titles.append(title)
                raw_negatives.append(text)
            for n_id, n_title, n_psg in zip(raw_negatives_ids,
                                            raw_negative_titles, raw_negatives):
                total_created_examples += 1
                writer.write({
                    "id": n_id,
                    "title": n_title,
                    "psg": n_psg,
                    "label": 1,
                })
            if total_processed_examples % 2000 == 0 and total_processed_examples > 0:
                print(f"Processed {total_processed_examples} examples")
                print(f"Created {total_created_examples} examples")
    print(f"Total processed {total_processed_examples} examples")
    print(f"Total created {total_created_examples} examples")

def extract_predictions(reader_output, outfile):
    with jsonlines.open(reader_output, mode="r") as reader, \
            jsonlines.open(outfile, mode='w') as writer:
        logging.info("Extracting answers")
        for e in reader:
            # keep the answer with the highest reader score
            pred_answer = e['answers'][argmax(e['reader_scores'])]
            prediction = {
                "question": e['raw_question'],
                "prediction": pred_answer
            }
            writer.write(prediction)

def convert(filepath):
    with open(filepath, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        lines = fin.readlines()
    all_data = []
    with tqdm(total=len(lines)) as pbar:
        # the file is a sequence of 3-line records:
        # sentence with a $T$ placeholder, target phrase, polarity
        for i in range(0, len(lines), 3):
            text_left, _, text_right = [
                s.lower().strip() for s in lines[i].partition("$T$")
            ]
            target_phrase = lines[i + 1].lower().strip()
            polarity = lines[i + 2].strip()
            data = {
                'text_left': text_left,
                'text_right': text_right,
                'target_phrase': target_phrase,
                'polarity': polarity,
            }
            all_data.append(data)
            pbar.update(3)
    with jsonlines.open(filepath + '.jsonl', 'w') as writer:
        writer.write_all(all_data)

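# The input is expected as 3-line records, as in the common ABSA datasets
# (the sample below is illustrative, not taken from the repo):
#   the $T$ was great but the service was slow
#   food
#   1
# convert("restaurants_train.txt") then writes restaurants_train.txt.jsonl with
# text_left="the", target_phrase="food", text_right="was great but the service
# was slow", polarity="1".
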
def load_raw_test_toefl():
    pos_tags = {}
    with jsonlines.open(f'{TOEFL_TEST}/toefl_skll_test_features_no_labels/all_pos/P.jsonlines') as reader:
        for obj in reader:
            txt_id, sent_id, word_id, word = obj['id'].split("_")
            pos_tag = obj['x']['stanford_postag']
            pos_tags[(txt_id, int(sent_id), int(word_id))] = pos_tag
    toefl_test_sents = {}  # (essay, sentence_id) mapping to sentence
    raw_test_toefl = []
    for filename in os.listdir(f'{TOEFL_TEST}/essays'):
        fileid = filename.split('.')[0]
        with open(f'{TOEFL_TEST}/essays/{filename}') as f:
            lines = [line.rstrip() for line in f]
        for i in range(len(lines)):
            tok_sent = lines[i].split()
            sentence = lines[i]
            toefl_test_sents[(fileid, i + 1)] = sentence
            for j in range(len(tok_sent)):
                if (fileid, i + 1, j + 1) in pos_tags:
                    pos_tag = pos_tags[(fileid, i + 1, j + 1)]
                    if pos_tag.startswith('V'):
                        # sentence, verb_idx, key
                        raw_test_toefl.append([sentence, j, f'{fileid}_{i+1}_{j+1}'])
    return raw_test_toefl, toefl_test_sents

def get_golden_passages(data_source):
    with jsonlines.open(data_source) as dataset:
        return {ex['contexts']['positive_ctx'] for ex in dataset if ex['is_mapped']}

def load_raw_train_toefl():
    pos_and_label = {}
    with jsonlines.open(f'{TOEFL_TRAIN}/toefl_skll_train_features/all_pos/P.jsonlines') as reader:
        for obj in reader:
            txt_id, sent_id, word_id, word = obj['id'].split("_")
            pos_tag = obj['x']['stanford_postag']
            label = obj['y']
            pos_and_label[(txt_id, int(sent_id), int(word_id))] = (pos_tag, label)
    toefl_train_sents = {}  # (essay, sentence_id) mapping to sentence
    raw_train_toefl = []
    for filename in os.listdir(f'{TOEFL_TRAIN}/essays'):
        fileid = filename.split('.')[0]
        with open(f'{TOEFL_TRAIN}/essays/{filename}') as f:
            lines = [line.rstrip() for line in f]
        for i in range(len(lines)):
            tok_sent = lines[i].split()
            sentence = lines[i].replace('M_', '')  # strip the 'M_' prefixes from the raw text
            toefl_train_sents[(fileid, i + 1)] = sentence
            for j in range(len(tok_sent)):
                if (fileid, i + 1, j + 1) in pos_and_label:
                    pos_tag, label = pos_and_label[(fileid, i + 1, j + 1)]
                    if pos_tag.startswith('V'):
                        # sentence, verb_idx, label
                        raw_train_toefl.append([sentence, j, int(label)])
    return raw_train_toefl, toefl_train_sents

def post_training(
    self,
    selected_model_path,
    selected_model_filename,
    test_data_loader,
    selected_model_dev_stats,
    time_training_elapsed_mins,
):
    logger.info("loading selected model from training: {}".format(selected_model_path))
    self.model.load_state_dict(torch.load(selected_model_path))
    logger.info("evaluating selected model on test-set")
    # set model into evaluation mode (cf. https://pytorch.org/docs/stable/nn.html#torch.nn.Module.train)
    self.model.eval()

    # do the actual evaluation
    filepath_stats_prefix = os.path.join(self.opt.experiment_path, "statistics",
                                         selected_model_filename)
    os.makedirs(filepath_stats_prefix, exist_ok=True)
    if not filepath_stats_prefix.endswith("/"):
        filepath_stats_prefix += "/"
    test_stats = self._evaluate(test_data_loader, get_examples=True,
                                basepath=filepath_stats_prefix)
    test_snem = test_stats[self.opt.snem]
    self.evaluator.print_stats(test_stats, "evaluation on test-set")

    # save dev and test results
    experiment_results = {}
    experiment_results["test_stats"] = self.get_serializable_stats(test_stats)
    experiment_results["dev_stats"] = self.get_serializable_stats(selected_model_dev_stats)
    experiment_results["options"] = self.get_serializable_opts()
    experiment_results["time_training_elapsed_mins"] = time_training_elapsed_mins
    experiment_results_path = os.path.join(self.opt.experiment_path,
                                           "experiment_results.jsonl")
    with jsonlines.open(experiment_results_path, "w") as writer:
        writer.write(experiment_results)

    # save confusion matrices
    test_confusion_matrix = test_stats["confusion_matrix"]
    create_save_plotted_confusion_matrix(
        test_confusion_matrix,
        expected_labels=self.sorted_expected_label_values,
        basepath=filepath_stats_prefix,
    )
    logger.info("finished execution of this run. exiting.")
    # print the snem value to stdout, for the controller to parse it
    print(test_snem)

def load_pos_annotations(ex_features, directory):
    with jsonlines.open(f"{directory}/P.jsonlines") as reader:
        for obj in reader:
            txt_id, sent_id, word_id = obj['id'].split("_")
            stanford_postag = obj['x']['postag']
            ex_features[(txt_id, sent_id, int(word_id))]['stanford_postag'] = stanford_postag
    return ex_features

def original_sentences(filename: str) -> Iterator[List[str]]:
    # generator, so the annotation is Iterator rather than Sequence
    stored_cache: Set = set()
    with jsonlines.open(filename) as reader:
        for sentence in tqdm(reader, total=2230373):
            # deduplicate sentences by a joined-token key
            key = '-:-'.join(sentence)
            if key in stored_cache:
                continue
            stored_cache.add(key)
            yield sentence

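# Sketch of consuming the deduplicating generator above; both paths are
# hypothetical placeholders. The total=2230373 inside is only a tqdm
# progress hint and does not limit iteration.
import jsonlines

with jsonlines.open("unique_sentences.jsonl", mode="w") as writer:
    for sent in original_sentences("all_sentences.jsonl"):
        writer.write(sent)
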
def run_score_aggregation(outputs, aggregation_config, aggregation_outfile):
    pipeline_data, metadata = load_pipeline_data(outputs, aggregation_config)
    pipeline_data = torch.FloatTensor(pipeline_data).transpose(-1, -2)
    parameters = {
        k: torch.FloatTensor(v)
        for k, v in aggregation_config["parameters"].items()
    }
    aggregated_logits = F.linear(pipeline_data, **parameters).squeeze(-1).tolist()
    with jsonlines.open(outputs["reader_output"], "r") as reader_outputs, \
            jsonlines.open(aggregation_outfile, "w") as ofwriter:
        for e, q, logits in zip(reader_outputs, metadata["questions"],
                                aggregated_logits):
            assert q == e["raw_question"]
            e["reader_scores"] = logits
            ofwriter.write(e)

def get_example_list(self):
    with open(self.datafile, encoding="utf-8") as f:
        num_lines = sum(1 for line in f)
    examples = []
    with jsonlines.open(self.datafile, "r") as fd:
        for idx, sample in tqdm(enumerate(fd), total=num_lines):  # TODO: parallelize?
            if self.is_training:
                examples += FusionInDecoderDataset.process_sample(
                    sample,
                    database=self.database,
                    tokenizer=self.tokenizer,
                    max_input_length=self.max_len,
                    context_size=self.context_length,
                    include_doc_masks=self.include_passage_masks,
                    include_golden_passage=self.include_golden_passage,
                    preprocessing_truncation=self.preprocessing_truncation,
                    one_answer_per_question=self.one_answer_per_question,
                    use_only_human_answer=self.use_only_human_answer)
            else:
                # Do not use the same question with multiple answers in validation
                examples += [
                    FusionInDecoderDataset.process_sample(
                        sample,
                        database=self.database,
                        tokenizer=self.tokenizer,
                        max_input_length=self.max_len,
                        context_size=self.context_length,
                        include_doc_masks=self.include_passage_masks,
                        include_golden_passage=False,
                        preprocessing_truncation=self.preprocessing_truncation)[0]
                ]
            if idx == 0:
                logging.info("Example of input formats:")
                src_example1 = " ".join(
                    self.tokenizer.convert_ids_to_tokens(examples[0]["sources"][0]))
                src_example2 = " ".join(
                    self.tokenizer.convert_ids_to_tokens(examples[0]["sources"][1]))
                if len(examples[0]["target"]) > 1:
                    possible_target = examples[0]["target"]
                    if type(possible_target) == list:
                        possible_target = possible_target[0]
                    target_example = " ".join(
                        self.tokenizer.convert_ids_to_tokens(possible_target))
                logging.info("inputs 1:")
                logging.info(src_example1)
                logging.info("inputs 2:")
                logging.info(src_example2)
                if len(examples[0]["target"]) > 1:
                    logging.info("target:")
                    logging.info(target_example)
    return examples

def load_wordnet_annotations(ex_features, directory):
    with jsonlines.open(f"{directory}/WordNet.jsonlines") as reader:
        for obj in reader:
            txt_id, sent_id, word_id = obj['id'].split("_")
            # 26-dimensional binary vector of WordNet annotations
            wn_vector = np.zeros(26)
            for annotation in obj['x']:
                index = int(annotation.split("_")[-1])
                wn_vector[index - 1] = 1
            ex_features[(txt_id, sent_id, int(word_id))]['wordnet'] = wn_vector
    return ex_features

def get_experiment_result_detailed(experiment_path):
    experiment_results_path = os.path.join(experiment_path, "experiment_results.jsonl")
    try:
        with jsonlines.open(experiment_results_path, "r") as reader:
            lines = list(reader)
        assert len(lines) == 1
        return lines[0]
    except FileNotFoundError:
        return None

def load_topic_annotations(ex_features, directory):
    with jsonlines.open(f"{directory}/T.jsonlines") as reader:
        for obj in reader:
            txt_id, sent_id, word_id = obj['id'].split("_")
            # 100-dimensional LDA topic-weight vector
            lda_vector = np.zeros(100)
            for annotation in obj['x']:
                index = int(annotation.split("-")[-1])
                lda_vector[index - 1] = float(obj['x'][annotation])
            ex_features[(txt_id, sent_id, int(word_id))]['topic_lda'] = lda_vector
    return ex_features

def load_cbias_annotations(ex_features, directory):
    # C-BiasUp and C-BiasDown share the same 17-dimensional layout,
    # so both files go through one loader
    def load_file(filename, feature_key):
        with jsonlines.open(f"{directory}/{filename}") as reader:
            for obj in reader:
                txt_id, sent_id, word_id = obj['id'].split("_")
                ccb_vector = np.zeros(17)
                for annotation in obj['x']:
                    index = int(annotation.split("-")[-1])
                    ccb_vector[index - 1] = int(obj['x'][annotation])
                ex_features[(txt_id, sent_id, int(word_id))][feature_key] = ccb_vector

    load_file("C-BiasUp.jsonlines", "cbiasup")
    load_file("C-BiasDown.jsonlines", "cbiasdown")
    return ex_features

def load_label_ul_annotations(ex_features, directory, include_labels=True):
    with jsonlines.open(f"{directory}/UL.jsonlines") as reader:
        for obj in reader:
            txt_id, sent_id, word_id = obj['id'].split("_")
            ex_id = (txt_id, sent_id, int(word_id))
            # obj["x"] holds exactly one item; unpack its value
            (_, ul), = obj["x"].items()
            ex_features[ex_id] = {}
            ex_features[ex_id]['ul'] = ul
            ex_features[ex_id]['id'] = ex_id
            if include_labels:
                ex_features[ex_id]['label'] = int(obj['y'])
    return ex_features

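# Usage sketch for the feature loaders above (the directory name is a
# hypothetical placeholder). load_label_ul_annotations is the only loader
# that creates the per-token dicts, so it must run before the loaders that
# only add keys to existing entries.
FEATURE_DIR = "features/all_pos"
ex_features = load_label_ul_annotations({}, FEATURE_DIR)
ex_features = load_pos_annotations(ex_features, FEATURE_DIR)
ex_features = load_wordnet_annotations(ex_features, FEATURE_DIR)
ex_features = load_topic_annotations(ex_features, FEATURE_DIR)
ex_features = load_cbias_annotations(ex_features, FEATURE_DIR)
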
def _read_json_file(self, file: Path) -> List:
    with open(file, mode='r') as f:
        total_count = sum(1 for _ in f)
    data_json = []
    with jsonlines.open(file) as reader:
        for obj in self.iterate(reader, f'Reading file {file.name}',
                                total_count=total_count):
            if 'field' in obj:
                data_json.append(obj['field'])
                data_json[-1]['entityId'] = obj['entityId']
            else:
                data_json.append(obj)
    return data_json

def download(version, directory: str, dataset: list):
    if version["version"] != "Mediapipe":
        raise ValueError("Running this addon version is not implemented")
    poses_dir = path.join(directory, "poses")
    makedir(poses_dir)
    Docker.verify_image_exists(DOCKER_NAME)
    should_cleanup = False
    while True:
        existing = {path.join(poses_dir, di) for di in os.listdir(poses_dir)}
        missing_data = []
        for datum in dataset:
            datum["pose_dir"] = path.join(poses_dir, datum["id"])
            if datum["pose_dir"] not in existing:
                missing_data.append(datum)
        # Break when finished
        if len(missing_data) == 0:
            break
        print(missing_data)
        should_cleanup = True
        print("Done", len(dataset) - len(missing_data), "/", len(dataset), "tasks")
        # should_cleanup = False
        # for datum in tqdm(missing_data):
        #     pose_video(datum)
        distributed.clear_tasks()
        distributed.kill_slaves()
        clean_dockers()
        distributed.spawn_workers().flower()
        distributed.run(pose_video, missing_data[:50000])
    if should_cleanup:
        distributed.kill_slaves()
        clean_dockers()
    with jsonlines.open(path.join(directory, "index.jsonl"), mode='w') as writer:
        for datum in tqdm(dataset):
            writer.write({
                "id": datum["id"],
                "poses": get_directory_hands(datum["pose_dir"])
            })

def process_source_json(args):
    # Map each story id to its (absolute position, relative position, attribute)
    # tuples so they can be extracted and analysed separately.
    story_map = defaultdict(list)
    with jsonlines.open(args["source_json"], mode='r') as reader:
        for i, json_obj in enumerate(reader):
            story_id = json_obj["metadata"]["story_id"]
            abs_pos = json_obj["metadata"]["absolute_position"]
            rel_pos = json_obj["metadata"]["relative_position"]
            attribute = json_obj[args["attribute"]]
            story_map[story_id].append((abs_pos, rel_pos, attribute))
    return story_map

def download(directory: str, version, module_path: str, dataset=None):
    makedir(directory)
    version_dir = path.join(directory, version["version"])
    index_path = path.join(version_dir, 'index.jsonl')
    if not exists(version_dir) or not exists(index_path):
        makedir(version_dir)
        module = modular_import("module", module_path)
        if dataset is None:
            module.download(version, version_dir)
        else:
            module.download(version, version_dir, dataset)
    data = list(jsonlines.open(index_path))
    return version_dir, data

def run_reader_extractive(checkpointDict, reader_output, reranker_output):
    ext_reader_cfg = config["reader"]["extractive"]["config"]
    cache_dir = config["transformers_cache"]
    # overwrite the old loaded cache path
    checkpointDict["config"]["cache"] = cache_dir
    model = Reader(checkpointDict["config"], initPretrainedWeights=False)
    Checkpoint.loadModel(model, checkpointDict, config["device"])
    if "multi_gpu" in ext_reader_cfg and ext_reader_cfg["multi_gpu"] \
            and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)
        logging.info("DataParallel active!")
    extractor = AnswerExtractor(model, config["device"])
    extractor.model.eval()
    tokenizer = AutoTokenizer.from_pretrained(
        checkpointDict["config"]['tokenizer_type'],
        cache_dir=cache_dir,
        use_fast=True)
    database = get_database_path()
    database = PassDatabase(database)
    with ReaderDataset(reranker_output, tokenizer, database,
                       ext_reader_cfg["batch_size"],
                       checkpointDict["config"]['include_doc_title']) as dataset:
        logging.info("Extracting top k answer scores")
        res = {}
        for i, (query, answers, scores, passageIds, charOffsets) in tqdm(
                enumerate(extractor.extract(dataset,
                                            ext_reader_cfg["top_k_answers"],
                                            ext_reader_cfg["max_tokens_for_answer"])),
                total=len(dataset)):
            res[i] = {
                "raw_question": query,
                "answers": answers,
                "reader_scores": scores,
                "passages": passageIds,
                "char_offsets": charOffsets
            }
    with jsonlines.open(reader_output, "w") as wF:
        for _, record in res.items():
            wF.write(record)

def download_firebase_collection(args):
    print(f"Download Firebase collection: {args}")
    collection_data = []
    cred = credentials.Certificate(args["firebase_key_path"])
    firebase_admin.initialize_app(cred)
    db = firestore.client()
    collection_ref = db.collection(args['collection_name'])
    docs = collection_ref.stream()
    for doc in docs:
        doc_dict = {
            "id": doc.id,
            "document": doc.to_dict(),
            "collection": args['collection_name'],
        }
        collection_data.append(doc_dict)
    with jsonlines.open(args['output_file'], mode='w') as writer:
        for d in collection_data:
            writer.write(d)

def read_results(test_number: str, policy_name: str) -> pd.DataFrame:
    """
    Reads in a results file, returns a dataframe including ratios
    """
    path = 'results/{}-{}'.format(test_number, policy_name)
    rows = []
    with jsonlines.open(path) as f:
        for result in f:
            sequence_type = result['sequence_type']
            graphs = '.'.join(result['graphs'])
            for i in range(len(result['utilisations'])):
                rows.append({
                    'utilisation': result['utilisations'][i],
                    'opt_utilisation': result['opt_utilisations'][i],
                    'oblivious_utilisation': result['oblivious_utilisations'][i],
                    'sequence_type': sequence_type,
                    'graphs': graphs
                })
    # building the frame once avoids the removed row-by-row DataFrame.append
    df = pd.DataFrame(rows, columns=[
        'utilisation', 'opt_utilisation', 'oblivious_utilisation',
        'sequence_type', 'graphs'
    ])
    # to easily separate experiments in plots
    df['test_number'] = test_number
    df['policy_name'] = policy_name
    # calculate ratios
    df['ratio'] = df['utilisation'] / df['opt_utilisation']
    df['oblivious_ratio'] = df['oblivious_utilisation'] / df['opt_utilisation']
    return df

def read_result(model_id: str, test_id: str, policy_id: str) -> pd.DataFrame:
    """
    Reads in a results file, returns a dataframe including ratios
    """
    path = "results/overfit-{}-{}-{}".format(model_id, test_id, policy_id)
    rows = []
    with jsonlines.open(path) as f:
        for result in f:
            for i in range(len(result['utilisations'])):
                rows.append({
                    'utilisation': result['utilisations'][i],
                    'opt_utilisation': result['opt_utilisations'][i],
                    'oblivious_utilisation': result['oblivious_utilisations'][i],
                    'action': result['actions'][i]
                })
    # building the frame once avoids the removed row-by-row DataFrame.append
    df = pd.DataFrame(rows, columns=[
        'utilisation', 'opt_utilisation', 'oblivious_utilisation', 'action'
    ])
    # to easily separate experiments in plots
    if len(model_id) > 1:
        x_value = int(model_id[2])
    else:
        x_value = int(model_id)
    df['model_id'] = model_id
    df['x_value'] = x_value
    df['test_id'] = test_id
    df['policy_id'] = policy_id
    # calculate ratios
    df['ratio'] = df['utilisation'] / df['opt_utilisation']
    df['oblivious_ratio'] = df['oblivious_utilisation'] / df['opt_utilisation']
    return df

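# Possible downstream use of the readers above; the test numbers and policy
# name are hypothetical placeholders, not values from the repo.
import pandas as pd

frames = [read_results("01", "gravity"), read_results("02", "gravity")]
all_results = pd.concat(frames, ignore_index=True)
print(all_results.groupby("test_number")["ratio"].describe())
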
def main():
    domains = load_domains()
    names = load_entities()
    print("querying:")
    print(f'{len(domains)} domains')
    print(f'{len(names)} entities')
    print()
    # read the crawl time spans of the domains
    global domains_crawled_dates
    with open('config_data/domains_crawled_dates.json') as f_in:
        domains_crawled_dates = json.load(f_in)
    for name in names:
        print(name)
        f_name = '_'.join(name.split()) + '.jsonl'
        results = runner(domains, name)
        if results:
            with jsonlines.open(OUTPUT_DIR + "/" + f_name, mode='w') as writer:
                for k, v in results.items():
                    for r in v:
                        writer.write(r)

def dump_results(self, dict_for_dumping, file, is_test):
    if not self.is_writer:
        return
    if is_test:
        assert file in self.test_files, \
            f'The dump file {file} is not in our possible files from {self.test_files.keys()}'
        file = self.test_files[file]
    else:
        assert file in self.dev_files, \
            f'The dump file {file} is not in our possible files from {self.dev_files.keys()}'
        file = self.dev_files[file]
    file_mode = 'a'  # always append, for both dev and test dumps
    with jsonlines.open(file, file_mode) as f:
        for json_obj in dict_for_dumping:
            train_head = json_obj["train_head"]
            eval_slice = json_obj["eval_slice"]
            tag = "/".join([train_head, eval_slice])
            self.tb_writer.add_scalar(tag=tag,
                                      scalar_value=json_obj["f1_micro"],
                                      global_step=json_obj["global_step"])
            f.write(json_obj)

def process_source_json(args):
    # Map story id and position to row indices so embeddings can be extracted
    # and analysed separately.
    story_id_map = defaultdict(list)
    positions_map = defaultdict(list)
    story_ids = []
    absolute_positions = []
    story_ids_and_pos = []
    source_embeddings = []
    target_embeddings = []
    with jsonlines.open(args["source_json"], mode='r') as reader:
        for i, json_obj in enumerate(reader):
            story_id = json_obj["metadata"]["story_id"]
            abs_pos = json_obj["metadata"]["absolute_position"]
            story_id_map[story_id].append(i)
            positions_map[abs_pos].append(i)
            story_ids.append(story_id)
            absolute_positions.append(abs_pos)
            story_ids_and_pos.append(f"{story_id}_{abs_pos}")
            source_embeddings.append(json_obj["source_embeddings"])
            target_embeddings.append(json_obj["target_embeddings"])
    source_embeddings_arr = da.from_array(source_embeddings, chunks=(1000, 1000))
    target_embeddings_arr = da.from_array(target_embeddings, chunks=(1000, 1000))
    if args["normalize"]:
        source_embeddings_arr = da.from_array(
            preprocessing.scale(source_embeddings_arr), chunks=(1000, 1000))
        target_embeddings_arr = da.from_array(
            preprocessing.scale(target_embeddings_arr), chunks=(1000, 1000))
    return (source_embeddings_arr, target_embeddings_arr, story_ids,
            np.array(absolute_positions), story_ids_and_pos, story_id_map,
            positions_map)