def test(self, sess, token_ids):
    # We decode one sentence at a time.
    token_ids = data_utils.padding(token_ids)
    target_ids = data_utils.padding([data_utils.GO_ID])
    y_ids = data_utils.padding([data_utils.EOS_ID])
    encoder_inputs, decoder_inputs, _ = data_utils.nextRandomBatch(
        [(token_ids, target_ids, y_ids)], batch_size=1)
    prediction = sess.run(self.prediction, feed_dict={
        self.encoder_inputs: encoder_inputs,
        self.decoder_inputs: decoder_inputs
    })
    # Greedy decoding: take the argmax over the vocabulary axis.
    # Using the fetched array's own argmax avoids adding a new graph op
    # (the deprecated tf.arg_max) and a second session call per sentence.
    pred_max = prediction.argmax(axis=1)
    # If an EOS symbol appears in the outputs, they can be cut at that
    # point (see the sketch below).
    return pred_max
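
The EOS handling hinted at above can stand alone; a minimal sketch (not part of the source), assuming the usual data_utils convention where EOS_ID = 2:

EOS_ID = 2  # assumed; the common data_utils layout is PAD=0, GO=1, EOS=2

def truncate_at_eos(outputs):
    # Cut the greedily decoded ids at the first EOS symbol, if present.
    outputs = list(outputs)
    if EOS_ID in outputs:
        outputs = outputs[:outputs.index(EOS_ID)]
    return outputs

print(truncate_at_eos([17, 42, 9, 2, 0, 0]))  # -> [17, 42, 9]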
Example #2
def predict(word_raw):
    # Wrap the raw sentence with character-level features.
    sentence = add_char_information([format_data(word_raw)])
    # Map tokens, casings, and characters to indices, then pad.
    test_sent = padding(create_matrices(sentence, word2idx, label2idx,
                                        case2idx, char2idx))
    sent_batch, _ = create_batches(test_sent)
    tokens, casing, char, labels = sent_batch[0]
    # Add a batch dimension of 1 for the model.
    tokens = np.asarray([tokens])
    casing = np.asarray([casing])
    char = np.asarray([char])
    pred = model.predict([tokens, casing, char], verbose=False)[0]
    # Greedy decoding: the most probable label per token.
    pred = pred.argmax(axis=-1)
    # Pair each input token with its predicted label string.
    pred_sent = list(zip(word_raw.split(), [idx2_label[i] for i in pred]))
    return pred_sent
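
A usage sketch, assuming the module-level model and the word2idx / case2idx / char2idx / idx2_label lookups are already loaded; the sentence and labels below are illustrative only:

# Hypothetical call; actual labels depend on the trained model.
print(predict("Angela Merkel visited Paris"))
# e.g. [('Angela', 'B-PER'), ('Merkel', 'I-PER'),
#       ('visited', 'O'), ('Paris', 'B-LOC')]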
Example #3
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of dictionaries, one per sentence,
    containing:
        - string: the original tokens
        - chars: padded character indexes
        - segs: padded segmentation features
        - tags: padded tag indexes
        - seqlen: the unpadded sequence length
    """
    length = padding_length(sentences)
    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x
    data = []
    for s in sentences:
        string = [w[0] for w in s]
        # Map each (optionally lowercased) character to its index, falling
        # back to the '<UNK>' entry for out-of-vocabulary characters.
        chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                 for w in string]
        # Record the true (unpadded) length, capped at the padding length.
        sequence_length = min(len(chars), length)
        chars = padding(chars, length)
        segs = get_seg_features("".join(string))
        segs = padding(segs, length)
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            # At inference time the gold tags are unknown; fill with "O".
            tags = [none_index for _ in chars]
        tags = padding(tags, length)
        data.append({
            "string": string,
            "chars": chars,
            "segs": segs,
            "tags": tags,
            "seqlen": sequence_length
        })

    return data
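
Neither padding nor padding_length is defined in this snippet; a minimal sketch consistent with how they are called above (pad or truncate every sequence to one shared length), with the pad value of 0 being an assumption:

def padding_length(sentences):
    # The longest sentence determines the shared padded length.
    return max(len(s) for s in sentences)

def padding(seq, length, pad_value=0):
    # Truncate to `length`, then right-pad with pad_value.
    seq = list(seq)[:length]
    return seq + [pad_value] * (length - len(seq))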
Example #4
    def embed(self):
        """Create word- and character-level embeddings"""

        label_set = set()
        words = {}

        # unique words and labels in data
        for dataset in [self.train_sentences, self.dev_sentences, self.test_sentences]:
            for sentence in dataset:
                for token, char, label in sentence:
                    # token: the word, char: its character list, label: BIO tag
                    label_set.add(label)
                    words[token.lower()] = True

        # mapping for labels
        self.label_to_idx = {}
        for label in label_set:
            self.label_to_idx[label] = len(self.label_to_idx)

        # mapping for token cases
        case_to_idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3, 'other': 4, 'mainly_numeric': 5,
                       'contains_digit': 6, 'PADDING_TOKEN': 7}
        self.case_embeddings = np.identity(len(case_to_idx), dtype='float32')  # one-hot case vectors via identity matrix

        # read GloVe word embeddings
        word_to_idx = {}
        self.word_embeddings = []

        with open("data/glove.50d.txt", encoding="utf-8") as f_embeddings:
            # loop through each word in the embedding file
            for line in f_embeddings:
                split = line.strip().split(" ")
                word = split[0]  # embedding word entry

                if len(word_to_idx) == 0:  # first line: reserve padding + unknown entries
                    word_to_idx["PADDING_TOKEN"] = len(word_to_idx)
                    vector = np.zeros(len(split) - 1)  # zero vector for the 'PADDING' word
                    self.word_embeddings.append(vector)

                    word_to_idx["UNKNOWN_TOKEN"] = len(word_to_idx)
                    vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
                    self.word_embeddings.append(vector)

                # keep only embeddings for words that actually occur in the data
                if word.lower() in words:
                    vector = np.array([float(num) for num in split[1:]])
                    self.word_embeddings.append(vector)  # word embedding vector
                    word_to_idx[word] = len(word_to_idx)  # corresponding word index

        self.word_embeddings = np.array(self.word_embeddings)

        # dictionary of all possible characters
        self.char_to_idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in "’ỳ‘°fhXLẹủÀgÂếỒừHơý¼[êớ3BùỜnểỗPứỹAlâ+ÔẵÊ/.Ề-jÓ8CởVqĩẨk* " \
                 "òĐỆd4áỏệrUỐỪ>ỮóÐ]ễụRũ²ằự&ZồÕeẶuẽ0wố6ŨẢDSữẩọưQyèO)K³bắvãàÚạ?MÝÁỔỄÙìmặ27ƠỞửÍƯờỉầịĂềổậJđIpõỵẬộ~ôiY–9" \
                 "Ầð:FxG!a,5%(ísả…NWỨoỡTẫéú“ợEẻỲză\"ẤẠỷc;ấẳ1”ỰtỖỦ'":
            self.char_to_idx[c] = len(self.char_to_idx)

        def write(file_name, data):
            with open(file_name, "w") as f:
                json.dump(data, f)

        write("data/word.json", word_to_idx)
        write("data/label2idx.json", self.label_to_idx)
        write("data/case2idx.json", case_to_idx)
        write("data/char2idx.json", self.char_to_idx)
        # format: [[word indices], [case indices], [char indices], [label indices]]
        self.train_set = padding(
            create_matrices(self.train_sentences, word_to_idx, self.label_to_idx, case_to_idx, self.char_to_idx))
        self.dev_set = padding(
            create_matrices(self.dev_sentences, word_to_idx, self.label_to_idx, case_to_idx, self.char_to_idx))
        self.test_set = padding(
            create_matrices(self.test_sentences, word_to_idx, self.label_to_idx, case_to_idx, self.char_to_idx))

        self.idx_to_label = {v: k for k, v in self.label_to_idx.items()}
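
For context, a minimal sketch (not from the source) of how matrices like self.word_embeddings and self.case_embeddings are typically wired into a Keras model as fixed embedding weights; the shapes and layer setup here are assumptions:

import numpy as np
import tensorflow as tf

# Stand-in matrices with the shapes embed() produces: a small random
# word-vector table (50-d, like glove.50d.txt) and 8 one-hot case vectors.
word_embeddings = np.random.uniform(-0.25, 0.25, (1000, 50)).astype("float32")
case_embeddings = np.identity(8, dtype="float32")

words_in = tf.keras.layers.Input(shape=(None,), dtype="int32")
case_in = tf.keras.layers.Input(shape=(None,), dtype="int32")

# Both lookups are frozen: pretrained vectors for words, identity rows
# (one-hot) for casing.
word_emb = tf.keras.layers.Embedding(
    input_dim=word_embeddings.shape[0],
    output_dim=word_embeddings.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(word_embeddings),
    trainable=False)(words_in)
case_emb = tf.keras.layers.Embedding(
    input_dim=case_embeddings.shape[0],
    output_dim=case_embeddings.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(case_embeddings),
    trainable=False)(case_in)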