    # Score every (start, end) pair: broadcast the start and end logits into a
    # [batch, seq_length, seq_length] grid and add the pairwise cross term.
    i_leq_j_mask = tf.expand_dims(i_leq_j_mask, 0)
    logits = tf.expand_dims(start_logits, 2) + tf.expand_dims(end_logits, 1) + cross
    # Push masked (start, end) pairs to effectively zero probability.
    logits -= 10000 * i_leq_j_mask
    logits = tf.reshape(logits, [batch_size, seq_length * seq_length])
    short_p = tf.nn.softmax(logits)
    # Keep only the n_keep most probable spans per example.
    indices = tf.argsort(short_p, axis=1, direction='DESCENDING')[:, :n_keep]
    short_p = tf.gather(short_p, indices, batch_dims=1)
    return dict(unique_id=unique_id, ans_logits=ans_logits, long_p=long_p,
                short_p=short_p, short_p_indices=indices)


# Run prediction over the evaluation set, one question at a time.
test = read_dataset()
predictions = {}
for tema in test:                      # tema: one article
    for cosa in tema["paragraphs"]:    # cosa: one paragraph
        text = cosa["context"]
        for question in cosa["qas"]:
            unique_id = question["id"]
            text_tokens = tokenizer.tokenize(text)
            question_len = np.array([len(tokenizer.tokenize(question["question"]))])
            data_len = np.array([512])
            input_word_ids, input_mask, segment_id = convert_two_sentences_to_features(
                question["question"], text, tokenizer, 512)
            c = [data_len.reshape(-1,), input_word_ids.reshape(1, -1), question_len.reshape(-1,)]
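# convert_two_sentences_to_features is defined elsewhere in this repo. The
# sketch below is only an illustrative guess at what it does, assuming a BERT
# WordPiece tokenizer that exposes tokenize() and convert_tokens_to_ids():
# pack "[CLS] question [SEP] context [SEP]" into fixed-length id, mask and
# segment arrays. Names and padding details here are assumptions, not the
# repo's actual implementation.
def convert_two_sentences_to_features_sketch(question, context, tokenizer, max_seq_length):
    q_tokens = tokenizer.tokenize(question)
    c_tokens = tokenizer.tokenize(context)
    # Truncate the context so [CLS], the question and both [SEP]s still fit.
    c_tokens = c_tokens[:max_seq_length - len(q_tokens) - 3]
    tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + c_tokens + ["[SEP]"]
    segment_ids = [0] * (len(q_tokens) + 2) + [1] * (len(c_tokens) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad everything up to max_seq_length.
    pad = max_seq_length - len(input_ids)
    input_ids += [0] * pad
    input_mask += [0] * pad
    segment_ids += [0] * pad
    return np.array(input_ids), np.array(input_mask), np.array(segment_ids)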
# Earlier experiment: serialise the features to a TFRecord file and read them back.
# writer.write(serialized_features_dataset)
# writer.close()
# filenames = [filename]
# train_dataset = tf.data.TFRecordDataset('x_test.tfrecord').batch(32)
# for serialized_example in train_dataset:
#     for elem in serialized_example:
#         example = tf.train.Example()
#         example.ParseFromString(elem.numpy())
#         x_1 = np.array(example.features.feature['X'].float_list.value)
#         y_1 = np.array(example.features.feature['Y'].float_list.value)
#         break

# path = read_dataset(mode="train", dataset="naturalq", tokenizer=tokenizer,
#                     max_seq_length=max_seq_length, fragmented=False)

# import datetime
# t = datetime.datetime.now().time()
# log_name = "Salida_modelo_{}.txt".format(t)

# Build the named-input feed for the Keras model from the packed feature matrix:
# columns 0-2 hold the context ids/mask/segment, columns 3-5 the question's.
x, y = crear_batch(path, fragmented=False)
N = len(x)
entrada = {
    "questions_id": np.squeeze(x[:N, 3].astype(np.int32)),
    "question_input_mask": np.squeeze(x[:N, 4].astype(np.int32)),
    "question_segment_id": np.squeeze(x[:N, 5].astype(np.int32)),
    "context_id": np.squeeze(x[:N, 0].astype(np.int32)),
    "context_input_mask": np.squeeze(x[:N, 1].astype(np.int32)),
    "context_segment_id": np.squeeze(x[:N, 2].astype(np.int32)),
}
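# The commented-out TFRecord experiment above parses tf.train.Example protos by
# hand. For reference, a minimal sketch of the more usual tf.data route,
# assuming the records were written with float features named 'X' and 'Y' as in
# that experiment (feature names and shapes are assumptions):
feature_spec = {
    "X": tf.io.VarLenFeature(tf.float32),
    "Y": tf.io.VarLenFeature(tf.float32),
}

def _parse_example(serialized):
    parsed = tf.io.parse_single_example(serialized, feature_spec)
    return tf.sparse.to_dense(parsed["X"]), tf.sparse.to_dense(parsed["Y"])

# dataset = tf.data.TFRecordDataset('x_test.tfrecord').map(_parse_example).batch(32)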
# Same TFRecord round-trip experiment, kept for reference.
# writer = tf.data.experimental.TFRecordWriter(filename)
# writer.write(serialized_features_dataset)
# writer.close()
# filenames = [filename]
# train_dataset = tf.data.TFRecordDataset('x_test.tfrecord').batch(32)
# for serialized_example in train_dataset:
#     for elem in serialized_example:
#         example = tf.train.Example()
#         example.ParseFromString(elem.numpy())
#         x_1 = np.array(example.features.feature['X'].float_list.value)
#         y_1 = np.array(example.features.feature['Y'].float_list.value)
#         break

# path = read_dataset(mode="train", tokenizer=tokenizer,
#                     max_seq_length=max_seq_length, fragmented=False)

# Timestamp the log file name so each run writes to its own file.
import time
t = time.time()
log_name = "Salida_modelo_{}.txt".format(t)

x, y = crear_batch(path, fragmented=False)
entrada = {
    "questions_id": np.squeeze(x[:, 3]),
    "question_input_mask": np.squeeze(x[:, 4]),
    "question_segment_id": np.squeeze(x[:, 5]),
    "context_id": np.squeeze(x[:, 0]),
    "context_input_mask": np.squeeze(x[:, 1]),
    "context_segment_id": np.squeeze(x[:, 2]),
}
# Two target vectors, one per model output.
salida = [y[:, 0], y[:, 1]]
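# A minimal sketch of feeding `entrada` and `salida` to the Keras model. It
# assumes build_model (defined elsewhere in this repo) returns a compiled
# tf.keras.Model whose input layers carry the same names used in `entrada` and
# which has two outputs, and that max_seq_length is defined earlier in the
# script; the batch size, epoch count and CSVLogger callback are illustrative
# choices, not the repo's actual training configuration.
model = build_model(max_seq_length)
model.fit(entrada, salida, batch_size=8, epochs=2,
          callbacks=[tf.keras.callbacks.CSVLogger(log_name)])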
# PyTorch variant: build the model, move it to the GPU and set up the optimiser.
device = torch.cuda.current_device()
max_seq_length = 350
model = build_model(max_seq_length)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005, betas=(0.9, 0.98), eps=1e-9)
model.cuda()
criterion = negLogSum  # loss function defined elsewhere

# Train data
path = read_dataset(mode="train", dataset="squad", tokenizer=tokenizer,
                    max_seq_length=max_seq_length, fragmented=False, framework="torch")
x, y = crear_batch(path, fragmented=False)
# Transpose the list of per-example feature tuples into per-feature arrays.
thing = list(map(list, zip(*x)))
ids, mask = np.squeeze(np.array(thing[0])), np.squeeze(np.array(thing[1]))
train_dataset = SquadDataset(ids, mask, y[:, 0], y[:, 1])
train_dataset.batch(32)

# Test data
path = read_dataset(mode="test", dataset="squad", tokenizer=tokenizer,
                    max_seq_length=max_seq_length, fragmented=False,