Example #1
    # Score every (start, end) pair, push masked pairs toward -inf,
    # and keep the n_keep most probable spans per example.
    i_leq_j_mask = tf.expand_dims(i_leq_j_mask, 0)
    logits = tf.expand_dims(start_logits, 2) + tf.expand_dims(end_logits, 1) + cross
    logits -= 10000 * i_leq_j_mask
    logits = tf.reshape(logits, [batch_size, seq_length * seq_length])
    short_p = tf.nn.softmax(logits)
    indices = tf.argsort(short_p, axis=1, direction='DESCENDING')[:, :n_keep]
    short_p = tf.gather(short_p, indices, batch_dims=1)
    return dict(unique_id=unique_id,
                ans_logits=ans_logits,
                long_p=long_p,
                short_p=short_p,
                short_p_indices=indices)
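
# Sketch added for illustration (not part of the original example): the
# flattened short_p_indices above index a [seq_length * seq_length] grid of
# (start, end) pairs, so integer division and modulo by seq_length recover
# the original token positions.
def decode_short_spans(short_p_indices, seq_length):
    start_idx = short_p_indices // seq_length  # row of the seq x seq grid
    end_idx = short_p_indices % seq_length     # column of the grid
    return start_idx, end_idx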



test = read_dataset()
predictions = {}

# Iterate over every question in the evaluation set and build the model
# inputs for each (question, context) pair.
for tema in test:
    for cosa in tema["paragraphs"]:
        text = cosa["context"]
        for question in cosa["qas"]:
            unique_id = question["id"]
            text_tokens = tokenizer.tokenize(text)

            question_len = np.array([len(tokenizer.tokenize(question["question"]))])
            data_len = np.array([512])
            input_word_ids, input_mask, segment_id = convert_two_sentences_to_features(
                question["question"], text, tokenizer, 512)

            c = [data_len.reshape(-1,), input_word_ids.reshape(1, -1), question_len.reshape(-1,)]
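
# Hypothetical sketch of what a helper like convert_two_sentences_to_features
# might look like for a BERT-style WordPiece tokenizer (the real helper is not
# shown in this example; names and padding conventions here are assumptions).
def convert_two_sentences_to_features_sketch(question, context, tokenizer, max_seq_length):
    q_tokens = tokenizer.tokenize(question)
    c_tokens = tokenizer.tokenize(context)
    # Build [CLS] question [SEP] context [SEP], truncating the context to fit.
    c_tokens = c_tokens[:max_seq_length - len(q_tokens) - 3]
    tokens = ["[CLS]"] + q_tokens + ["[SEP]"] + c_tokens + ["[SEP]"]
    input_word_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_word_ids)
    segment_id = [0] * (len(q_tokens) + 2) + [1] * (len(c_tokens) + 1)
    # Zero-pad every sequence out to max_seq_length.
    pad = max_seq_length - len(input_word_ids)
    input_word_ids += [0] * pad
    input_mask += [0] * pad
    segment_id += [0] * pad
    return np.array(input_word_ids), np.array(input_mask), np.array(segment_id)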
Example #2
# writer.write(serialized_features_dataset)
# writer.close()
# filenames = [filename]
# train_dataset = tf.data.TFRecordDataset('x_test.tfrecord').batch(32)
# for serialized_example in train_dataset:
#     for elem in serialized_example:
#         example = tf.train.Example()
#         example.ParseFromString(elem.numpy())
#         x_1 = np.array(example.features.feature['X'].float_list.value)
#         y_1 = np.array(example.features.feature['Y'].float_list.value)
#         break
#
#
path = read_dataset(mode="train",
                    dataset="naturalq",
                    tokenizer=tokenizer,
                    max_seq_length=max_seq_length,
                    fragmented=False)
#
# import datetime
# t = datetime.datetime.now().time()
# log_name = "Salida_modelo_{}.txt".format(t)
x, y = crear_batch(path, fragmented=False)
N = len(x)
entrada = {
    "questions_id": np.squeeze(x[:N, 3].astype(np.int32)),
    "question_input_mask": np.squeeze(x[:N, 4].astype(np.int32)),
    "question_segment_id": np.squeeze(x[:N, 5].astype(np.int32)),
    "context_id": np.squeeze(x[:N, 0].astype(np.int32)),
    "context_input_mask": np.squeeze(x[:N, 1].astype(np.int32)),
    "context_segment_id": np.squeeze(x[:N, 2].astype(np.int32))
Example #3
# writer = tf.data.experimental.TFRecordWriter(filename)
# writer.write(serialized_features_dataset)
# writer.close()
# filenames = [filename]
# train_dataset = tf.data.TFRecordDataset('x_test.tfrecord').batch(32)
# for serialized_example in train_dataset:
#     for elem in serialized_example:
#         example = tf.train.Example()
#         example.ParseFromString(elem.numpy())
#         x_1 = np.array(example.features.feature['X'].float_list.value)
#         y_1 = np.array(example.features.feature['Y'].float_list.value)
#         break
#

path = read_dataset(mode="train",
                    tokenizer=tokenizer,
                    max_seq_length=max_seq_length,
                    fragmented=False)
#
import time
t = time.time()
log_name = "Salida_modelo_{}.txt".format(t)
x, y = crear_batch(path, fragmented=False)
entrada = {
    "questions_id": np.squeeze(x[:, 3]),
    "question_input_mask": np.squeeze(x[:, 4]),
    "question_segment_id": np.squeeze(x[:, 5]),
    "context_id": np.squeeze(x[:, 0]),
    "context_input_mask": np.squeeze(x[:, 1]),
    "context_segment_id": np.squeeze(x[:, 2])
}
salida = [y[:, 0], y[:, 1]]
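
# Sketch added for illustration (not part of the original example): once a
# model trained on `entrada` / `salida` predicts start and end token indices,
# the answer text can be recovered from the tokenized context. The "##"
# handling assumes a WordPiece-style tokenizer such as BERT's.
def decode_answer(context_tokens, pred_start, pred_end):
    answer_tokens = context_tokens[pred_start:pred_end + 1]
    return " ".join(answer_tokens).replace(" ##", "")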
Example #4
    device = torch.cuda.current_device()

    max_seq_length = 350
    model = build_model(max_seq_length)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.00005,
                                 betas=(0.9, 0.98),
                                 eps=1e-9)
    model.cuda()

    criterion = negLogSum
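
    # Sketch of a plausible negLogSum for span prediction (an assumption; the
    # real negLogSum is defined elsewhere in the project): average the
    # cross-entropy of the gold start and end positions over the two heads.
    def neg_log_sum_sketch(start_logits, end_logits, start_positions, end_positions):
        start_loss = torch.nn.functional.cross_entropy(start_logits, start_positions)
        end_loss = torch.nn.functional.cross_entropy(end_logits, end_positions)
        return (start_loss + end_loss) / 2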

    # Train data
    path = read_dataset(mode="train",
                        dataset="squad",
                        tokenizer=tokenizer,
                        max_seq_length=max_seq_length,
                        fragmented=False,
                        framework="torch")
    x, y = crear_batch(path, fragmented=False)

    # Transpose the per-example feature tuples into per-feature lists.
    thing = list(map(list, zip(*x)))
    ids, mask = np.squeeze(np.array(thing[0])), np.squeeze(np.array(thing[1]))

    train_dataset = SquadDataset(ids, mask, y[:, 0], y[:, 1])
    train_dataset.batch(32)
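    # Note (added): SquadDataset appears to expose its own batch() helper; the
    # standard PyTorch alternative would be a DataLoader over the Dataset,
    # e.g. (assuming SquadDataset implements __len__ and __getitem__):
    #     from torch.utils.data import DataLoader
    #     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)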
    # Test data
    path = read_dataset(mode="test",
                        dataset="squad",
                        tokenizer=tokenizer,
                        max_seq_length=max_seq_length,
                        fragmented=False,