Ejemplo n.º 1
0
    def __create_xy(self, dependency_tree, embedding_file, data_size, look_back, test=False):
        """Build paired token/head context windows and 0/1 labels.

        Each selected sentence is cut into chunks of at most ``look_back``
        tokens. For every token of a chunk one sample is emitted, made of the
        token's forward/backward context windows, the windows of another token
        of the same chunk (the "head" side), and a float label.

        Side effects: sets ``self.look_back``, ``self.distinct_words`` and
        ``self.distinct_tags``; reverses ``sentences`` in place when ``test``.

        Args:
            dependency_tree: Forwarded to ``DataUtils.parse_dependency_tree``.
            embedding_file: Forwarded to ``DataUtils.load_embeddings``; the
                result is indexed by word and must provide an ``"UNK"`` entry.
            data_size: Fraction of the sentences to use (clamped to at most 1).
            look_back: Window length; ``0`` selects the length of the longest
                selected sentence.
            test: Reverse the sentence list before taking the slice.

        Returns:
            ``(word_data, tag_data, probability)`` where ``word_data`` and
            ``tag_data`` are ``[(input_fwd, input_bwd), (head_fwd, head_bwd)]``
            with one row per sample, and ``probability`` holds one label per
            sample.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(dependency_tree)
        word_emb = DataUtils.load_embeddings(embedding_file)
        tag_int = DataUtils.create_int_dict(tags)

        data_size = int(len(sentences) * min(data_size, 1))

        if test:
            sentences.reverse()

        selected = sentences[:data_size]

        if look_back == 0:
            for sentence in selected:
                look_back = max(look_back, len(sentence))

        self.look_back = look_back
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        # The buffers hard-code this width; assumes the loaded embeddings are
        # 300-dimensional -- TODO confirm against the embedding file.
        emb_dim = 300

        # Samples are collected in plain lists and stacked once at the end;
        # np.append inside the loop would re-copy every earlier sample.
        word_input_forward = []
        word_input_backward = []
        word_head_forward = []
        word_head_backward = []

        tag_input_forward = []
        tag_input_backward = []
        tag_head_forward = []
        tag_head_backward = []

        probability = []

        for progress, sentence in enumerate(selected):
            for start in range(0, len(sentence), look_back):
                part = sentence[start:start + look_back]
                part_len = len(part)
                offset = look_back - part_len  # left padding of a short chunk

                # Row 0: chunk in reading order, right-aligned.
                # Row 1: chunk in reverse order, right-aligned.
                word_temp = np.zeros((2, look_back, emb_dim))
                tag_temp = np.zeros((2, look_back), dtype="int32")

                for idx, token in enumerate(part):
                    word = token["word"]
                    vector = word_emb[word] if word in word_emb else word_emb["UNK"]
                    word_temp[0][offset + idx] = vector
                    word_temp[1][look_back - idx - 1] = vector

                    tag_id = tag_int[token["tag"]]
                    tag_temp[0][offset + idx] = tag_id
                    tag_temp[1][look_back - idx - 1] = tag_id

                # Per-token windows: [idx][0] is the prefix up to and including
                # token idx, [idx][1] the reversed suffix starting at token idx.
                word_instance = np.zeros((part_len, 2, look_back, emb_dim))
                tag_instance = np.zeros((part_len, 2, look_back), dtype="int32")

                for idx in range(part_len):
                    word_instance[idx][0][look_back - idx - 1:] = word_temp[0][offset:offset + idx + 1]
                    word_instance[idx][1][offset + idx:] = word_temp[1][offset:look_back - idx]

                    tag_instance[idx][0][look_back - idx - 1:] = tag_temp[0][offset:offset + idx + 1]
                    tag_instance[idx][1][offset + idx:] = tag_temp[1][offset:look_back - idx]

                for idx in range(part_len):
                    word_input = np.zeros((2, 2, look_back, emb_dim))
                    tag_input = np.zeros((2, 2, look_back), dtype="int32")
                    label = 0.0

                    # NOTE(review): the label becomes 1.0 if ANY other token of
                    # the chunk matches the head word, while the head-side
                    # windows end up being those of the LAST other token. This
                    # mirrors the previous behaviour; confirm it is intended.
                    for jdx in range(part_len):
                        if idx == jdx:
                            continue
                        if part[idx]["head"] == part[jdx]["word"]:
                            label = 1.0
                        word_input[0] = word_instance[idx]
                        tag_input[0] = tag_instance[idx]
                        word_input[1] = word_instance[jdx]
                        tag_input[1] = tag_instance[jdx]

                    word_input_forward.append(word_input[0][0])
                    word_input_backward.append(word_input[0][1])
                    # Fixed: the head windows used to be read from
                    # word_instance[1], which ignored the pairing above and
                    # raised IndexError for one-token chunks.
                    word_head_forward.append(word_input[1][0])
                    word_head_backward.append(word_input[1][1])

                    tag_input_forward.append(tag_input[0][0])
                    tag_input_backward.append(tag_input[0][1])
                    tag_head_forward.append(tag_input[1][0])
                    tag_head_backward.append(tag_input[1][1])

                    probability.append(label)

            DataUtils.update_message(str(progress) + "/" + str(data_size))

        word_data = [
            (np.array(word_input_forward), np.array(word_input_backward)),
            (np.array(word_head_forward), np.array(word_head_backward)),
        ]
        tag_data = [
            (np.array(tag_input_forward), np.array(tag_input_backward)),
            (np.array(tag_head_forward), np.array(tag_head_backward)),
        ]

        return word_data, tag_data, np.array(probability)
Ejemplo n.º 2
0
    def __create_xy(self, parse_tree_file, data_size, seq_len, test=False):
        """Build word-id context windows for every token pair of each chunk.

        Each sentence is cut into chunks of at most ``seq_len`` tokens. For
        every ordered pair ``(jdx, zdx)`` of tokens in a chunk (including
        ``jdx == zdx``) one row is written: the forward/backward windows of
        both tokens, the one-hot tag of token ``jdx`` and a label that is 1.0
        when the ``"head"`` of token ``jdx`` equals the ``"word"`` of token
        ``zdx``.

        Side effects: sets ``self.seq_len``, ``self.distinct_words`` and
        ``self.distinct_tags``.

        Args:
            parse_tree_file: Forwarded to ``DataUtils.parse_dependency_tree``.
            data_size: Accepted for interface compatibility; not used here.
            seq_len: Chunk and window length.
            test: Return rows 5000..9999 instead of rows 0..4999.

        Returns:
            ``([fwd_a, bwd_a, fwd_b, bwd_b], [tags, probability])`` where the
            ``_a`` arrays describe token ``jdx`` and ``_b`` token ``zdx``.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(
            parse_tree_file)
        word_int = DataUtils.create_int_dict(words)
        tag_onehot = DataUtils.create_onehot_vectors(tags)

        self.seq_len = seq_len
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        # Upper bound on the number of rows: every chunk is counted as if it
        # were full, so rows past the last written one stay zero.
        data_len = 0
        for sentence in sentences:
            data_len += int(np.ceil(len(sentence) / seq_len)) * seq_len * seq_len

        forward = np.zeros((2, data_len, seq_len), dtype="int32")
        backward = np.zeros((2, data_len, seq_len), dtype="int32")
        probability = np.zeros((data_len, ), dtype="float32")
        # Fixed: the width was hard-coded to 18. Assumes the one-hot vectors
        # have one slot per distinct tag -- TODO confirm with DataUtils.
        tag_rows = np.zeros((data_len, len(tags)))

        # Fixed: int(data_len / 100) is 0 for fewer than 100 rows, which made
        # the modulo below raise ZeroDivisionError.
        report_every = max(1, data_len // 100)

        idx = 0
        for sentence in sentences:
            for start in range(0, len(sentence), seq_len):
                part = sentence[start:start + seq_len]
                part_len = len(part)
                word_forward = np.zeros((seq_len, seq_len), dtype="int32")
                word_backward = np.zeros((seq_len, seq_len), dtype="int32")

                for jdx in range(part_len):
                    # Tokens 0..jdx in reading order, right-aligned.
                    word_forward[jdx][seq_len - jdx - 1:] = [
                        word_int[part[i]["word"]] for i in range(jdx + 1)
                    ]
                    # Tokens part_len-1..jdx in reverse order, right-aligned.
                    word_backward[jdx][seq_len - part_len + jdx:] = [
                        word_int[part[part_len - i - 1]["word"]]
                        for i in range(part_len - jdx)
                    ]

                for jdx in range(part_len):
                    for zdx in range(part_len):
                        tag_rows[idx] = tag_onehot[part[jdx]["tag"]]
                        forward[0][idx] = word_forward[jdx]
                        forward[1][idx] = word_forward[zdx]
                        backward[0][idx] = word_backward[jdx]
                        backward[1][idx] = word_backward[zdx]
                        is_head = part[jdx]["head"] == part[zdx]["word"]
                        probability[idx] = 1.0 if is_head else 0.0
                        idx += 1

                        if idx % report_every == 0:
                            DataUtils.update_message(
                                str(int(idx / data_len * 100)))

        # NOTE(review): the split is a fixed 5000-row window regardless of
        # data_size; kept as-is because callers may depend on these shapes.
        split = 5000
        window = slice(split, 2 * split) if test else slice(0, split)

        forward = [np.array(forward[0][window]), np.array(forward[1][window])]
        backward = [np.array(backward[0][window]), np.array(backward[1][window])]
        probability = np.array(probability[window])
        tag_rows = np.array(tag_rows[window])

        return [forward[0], backward[0], forward[1],
                backward[1]], [tag_rows, probability]
Ejemplo n.º 3
0
    def __create_xy(self, embedding_file, data_size, look_back, test=False):
        """Build full-chunk and per-token context windows plus head indicators.

        The dependency tree is selected by ``self.language``. Each selected
        sentence is cut into chunks of at most ``look_back`` tokens; samples
        consist of the chunk-wide word/tag windows, a per-token window and a
        ``(look_back, 1)`` column marking which chunk positions carry the
        token's head word.

        Side effects: sets ``self.look_back``, ``self.distinct_words`` and
        ``self.distinct_tags``; reverses ``sentences`` in place when ``test``;
        prints the shapes of three result arrays.

        Args:
            embedding_file: Forwarded to ``DataUtils.load_embeddings`` (with
                the extra ``"fasttext"`` argument when the language is
                ``"turkish"``); the result must provide an ``"UNK"`` entry.
            data_size: Fraction of the sentences to use (clamped to at most 1).
            look_back: Window length; ``0`` selects the length of the longest
                selected sentence.
            test: Reverse the sentence list before taking the slice.

        Returns:
            ``(word_data, tag_data, head)`` where ``word_data`` and
            ``tag_data`` are ``[(full_fwd, full_bwd), (inst_fwd, inst_bwd)]``
            and ``head`` holds one indicator column per sample.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(self.language)
        if self.language == "turkish":
            word_emb = DataUtils.load_embeddings(embedding_file, "fasttext")
        else:
            word_emb = DataUtils.load_embeddings(embedding_file)
        tag_int = DataUtils.create_int_dict(tags)

        data_size = int(len(sentences) * min(data_size, 1))

        if test:
            sentences.reverse()

        selected = sentences[:data_size]

        if look_back == 0:
            for sentence in selected:
                look_back = max(look_back, len(sentence))

        self.look_back = look_back
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        # The buffers hard-code this width; assumes the loaded embeddings are
        # 300-dimensional -- TODO confirm against the embedding file.
        emb_dim = 300

        # Samples are collected in lists and stacked once at the end instead
        # of re-copying everything with np.append for each sample.
        word_full_forward = []
        word_full_backward = []
        word_instance_forward = []
        word_instance_backward = []

        tag_full_forward = []
        tag_full_backward = []
        tag_instance_forward = []
        tag_instance_backward = []

        head = []

        for progress, sentence in enumerate(selected):
            for start in range(0, len(sentence), look_back):
                part = sentence[start:start + look_back]
                part_len = len(part)
                offset = look_back - part_len  # left padding of a short chunk

                word_temp = np.zeros((2, look_back, emb_dim))
                tag_temp = np.zeros((2, look_back), dtype="int32")

                for idx in range(part_len):
                    word = part[idx]["word"]
                    vector = word_emb[word] if word in word_emb else word_emb["UNK"]
                    word_temp[0][offset + idx] = vector
                    word_temp[1][look_back - idx - 1] = vector

                    tag_id = tag_int[part[idx]["tag"]]
                    tag_temp[0][offset + idx] = tag_id
                    tag_temp[1][look_back - idx - 1] = tag_id

                    # NOTE(review): the sample loop below runs once per idx,
                    # i.e. while word_temp/tag_temp are only filled up to
                    # token idx, and yields part_len**2 samples per chunk.
                    # It looks like it was meant to run after this fill loop;
                    # the nesting is preserved because un-nesting changes the
                    # number and content of the samples. Confirm the intent.
                    word_instance = np.zeros((2, look_back, emb_dim))
                    tag_instance = np.zeros((2, look_back), dtype="int32")

                    for jdx in range(part_len):
                        # The instance buffers are reused across jdx, so the
                        # backward rows keep entries written for smaller jdx.
                        word_instance[0][look_back - jdx - 1:] = \
                            word_temp[0][offset:offset + jdx + 1]
                        word_instance[1][offset + jdx:] = \
                            word_temp[1][offset:look_back - jdx]

                        tag_instance[0][look_back - jdx - 1:] = \
                            tag_temp[0][offset:offset + jdx + 1]
                        tag_instance[1][offset + jdx:] = \
                            tag_temp[1][offset:look_back - jdx]

                        head_instance = np.zeros((look_back, 1),
                                                 dtype="float32")
                        for zdx in range(part_len):
                            same = part[jdx]["head"] == part[zdx]["word"]
                            head_instance[zdx] = 1 if same else 0

                        # Fixed: snapshot the shared buffers. The very first
                        # sample used to be stored as a view and was
                        # overwritten by the next iteration before being
                        # copied.
                        word_full_forward.append(word_temp[0].copy())
                        word_full_backward.append(word_temp[1].copy())
                        word_instance_forward.append(word_instance[0].copy())
                        word_instance_backward.append(word_instance[1].copy())

                        tag_full_forward.append(tag_temp[0].copy())
                        tag_full_backward.append(tag_temp[1].copy())
                        tag_instance_forward.append(tag_instance[0].copy())
                        tag_instance_backward.append(tag_instance[1].copy())

                        head.append(head_instance)

            DataUtils.update_message(str(progress) + "/" + str(data_size))

        word_full_forward = np.array(word_full_forward)
        word_full_backward = np.array(word_full_backward)
        word_instance_forward = np.array(word_instance_forward)
        word_instance_backward = np.array(word_instance_backward)

        tag_full_forward = np.array(tag_full_forward)
        tag_full_backward = np.array(tag_full_backward)
        tag_instance_forward = np.array(tag_instance_forward)
        tag_instance_backward = np.array(tag_instance_backward)

        head = np.array(head)

        word_data = [(word_full_forward, word_full_backward),
                     (word_instance_forward, word_instance_backward)]
        tag_data = [(tag_full_forward, tag_full_backward),
                    (tag_instance_forward, tag_instance_backward)]

        # Fixed: these are always arrays now, so .shape no longer fails when
        # zero or one sample was produced.
        print(word_full_forward.shape, word_instance_forward.shape, head.shape)

        return word_data, tag_data, head
Ejemplo n.º 4
0
    def __create_xy(self,
                    dependency_tree,
                    embedding_file,
                    data_size,
                    look_back,
                    test=False):
        """Build fixed-length word, head and tag sequences per sentence chunk.

        Each selected sentence is walked token by token; every ``look_back``
        tokens (and at the end of the sentence) the filled buffers are emitted
        as one sample. Tokens whose word is ``"ROOT"`` occupy a position but
        leave it zero.

        Side effects: sets ``self.look_back``, ``self.distinct_words`` and
        ``self.distinct_tags``; reverses ``sentences`` in place when ``test``.

        Args:
            dependency_tree: Forwarded to ``DataUtils.parse_dependency_tree``.
            embedding_file: Forwarded to ``DataUtils.load_embeddings``; the
                result must provide an ``"UNK"`` entry.
            data_size: Fraction of the sentences to use (clamped to at most 1).
            look_back: Sample length; ``0`` selects the length of the longest
                selected sentence.
            test: Reverse the sentence list before taking the slice.

        Returns:
            ``(word_data, head_data, tag_data)`` as arrays with one row per
            emitted chunk: word embeddings, one-hot head vectors and int32
            tag ids.
        """
        sentences, words, tags = DataUtils.parse_dependency_tree(
            dependency_tree)
        word_vectors = DataUtils.create_onehot_vectors(words)
        word_emb = DataUtils.load_embeddings(embedding_file)
        tag_int = DataUtils.create_int_dict(tags)

        data_size = int(len(sentences) * min(data_size, 1))

        if test:
            sentences.reverse()

        selected = sentences[:data_size]

        if look_back == 0:
            for sentence in selected:
                look_back = max(look_back, len(sentence))

        self.look_back = look_back
        self.distinct_words = len(words)
        self.distinct_tags = len(tags)

        # Chunks are collected in lists and stacked once at the end instead
        # of re-copying everything with np.append for each chunk.
        word_chunks = []
        head_chunks = []
        tag_chunks = []

        for progress, sentence in enumerate(selected):
            # Width 300 is hard-coded; assumes 300-dimensional embeddings --
            # TODO confirm against the embedding file.
            word_timestep = np.zeros((look_back, 300))
            head_timestep = np.zeros((look_back, len(words)))
            tag_timestep = np.zeros((look_back, ), dtype="int32")

            for timestep, element in enumerate(sentence, start=1):
                slot = (timestep - 1) % look_back
                word = element["word"]

                if word != "ROOT":
                    word_timestep[slot] = word_emb[
                        word] if word in word_emb else word_emb["UNK"]
                    head_timestep[slot] = word_vectors[element["head"]]
                    tag_timestep[slot] = tag_int[element["tag"]]

                if timestep % look_back == 0 or timestep == len(sentence):
                    # Fixed: store copies. The first chunk used to be kept by
                    # reference and was wiped by the fill(0) calls below.
                    word_chunks.append(word_timestep.copy())
                    head_chunks.append(head_timestep.copy())
                    tag_chunks.append(tag_timestep.copy())

                    word_timestep.fill(0)
                    head_timestep.fill(0)
                    tag_timestep.fill(0)

            DataUtils.update_message(str(progress) + "/" + str(data_size))

        word_data = np.array(word_chunks)
        head_data = np.array(head_chunks)
        tag_data = np.array(tag_chunks)

        return word_data, head_data, tag_data