Ejemplo n.º 1
0
    def getTsvTestData(self, filepath, sep, sequence_length, y_value=False):
        """
        Load (label, text1, text2) records from a delimited, labelled file.

        Every line is stripped and split on ``sep``. Column 0 is the label and
        columns 1 and 2 are the two texts (called "mention" and "entity" in the
        original notes). A line is skipped when it has fewer than three columns
        or when either text is empty after ``preprocess_unit``. Both texts are
        cut down to at most ``sequence_length`` items.

        :param filepath: path of the UTF-8 encoded input file
        :param sep: column separator handed to ``split``
        :param sequence_length: maximum length kept for each text
        :param y_value: forwarded unchanged to ``self.add_y_helper``
        :return: tuple ``(x1, x2, y)`` of numpy arrays built from the kept
                 texts of column 1, the kept texts of column 2, and whatever
                 ``self.add_y_helper`` accumulated for the labels
        :raises ValueError: if column 0 of a kept line cannot be parsed by
                            ``int``
        """
        print("Loading testing/labelled data from " + filepath)
        x1, x2 = [], []
        y = []
        line_num = 0
        # The context manager guarantees the handle is closed even when a
        # malformed line raises part-way through the file.
        with codecs.open(filepath, "r", "utf-8") as data_file:
            for line in data_file:
                line_num += 1
                cols = line.strip().split(sep)
                if len(cols) < 3:
                    continue

                first = preprocess_unit(cols[1])
                second = preprocess_unit(cols[2])
                if len(first) == 0 or len(second) == 0:
                    continue

                # Slicing is a no-op for values already short enough, so both
                # texts can be truncated unconditionally.
                x1.append(first[:sequence_length])
                x2.append(second[:sequence_length])

                # The label is passed on as a boolean: True when column 0 is 1.
                y = self.add_y_helper(y_value, y, int(cols[0]) == 1)

                # Diagnostic output kept from the original: fires once the
                # number of lines read differs from the number of labels
                # stored (presumably because earlier lines were skipped).
                if line_num != len(y):
                    print("ei")
        print(line_num)
        return np.asarray(x1), np.asarray(x2), np.asarray(y)
Ejemplo n.º 2
0
    def getTsvTestData_Mul_Labels_Dyna(self,
                                       filepath,
                                       sep,
                                       sequence_length,
                                       y_value=False):
        """
        Load labelled records from a delimited multi-task file.

        Every line is stripped and split on ``sep``. Two layouts are accepted:

        * exactly 3 columns ``(label, text1, text2)``: a single-task record.
          The second task gets empty texts, a ``False`` label, and the
          indicator value ``1``.
        * otherwise 6 columns ``(label, text1, text2, label2, text3, text4)``:
          a two-task record with the indicator value ``0``.

        All texts go through ``preprocess_unit`` and are cut down to at most
        ``sequence_length`` items. Lines with fewer than three columns (for
        example blank lines) are skipped, matching ``getTsvTestData``.

        :param filepath: path of the UTF-8 encoded input file
        :param sep: column separator handed to ``split``
        :param sequence_length: maximum length kept for each text
        :param y_value: forwarded unchanged to ``self.add_y_helper``
        :return: tuple ``(indicate, x1, x2, x3, x4, y, y2)``; ``indicate`` is a
                 plain list of 0/1 flags, the remaining entries are numpy
                 arrays
        :raises IndexError: if a line has 4 or 5 columns
        :raises ValueError: if a label column cannot be parsed by ``int``
        """
        print("Loading testing/labelled data from " + filepath)
        x1, x2, x3, x4 = [], [], [], []
        y = []
        y2 = []
        indicate = []
        # The context manager guarantees the handle is closed even when a
        # malformed line raises part-way through the file.
        with codecs.open(filepath, "r", "utf-8") as data_file:
            for line in data_file:
                cols = line.strip().split(sep)
                if len(cols) < 3:
                    # Blank or truncated line: nothing usable for task 1.
                    continue

                # Slicing is a no-op for values already short enough, so the
                # texts can be truncated unconditionally.
                x1.append(preprocess_unit(cols[1])[:sequence_length])
                x2.append(preprocess_unit(cols[2])[:sequence_length])
                y = self.add_y_helper(y_value, y, int(cols[0]) == 1)

                if len(cols) == 3:
                    # Single-task record: pad the second task with
                    # placeholders so all outputs stay aligned line by line.
                    x3.append("")
                    x4.append("")
                    y2 = self.add_y_helper(y_value, y2, False)
                    indicate.append(1)
                else:
                    x3.append(preprocess_unit(cols[4])[:sequence_length])
                    x4.append(preprocess_unit(cols[5])[:sequence_length])
                    indicate.append(0)
                    y2 = self.add_y_helper(y_value, y2, int(cols[3]) == 1)

        return indicate, np.asarray(x1), np.asarray(x2), np.asarray(
            x3), np.asarray(x4), np.asarray(y), np.asarray(y2)
Ejemplo n.º 3
0
    def getTsvData(self,
                   filepath,
                   sep,
                   max_record_entity,
                   sequence_length,
                   y_value=False):
        """
        Load grouped (text1, text2, label) triples from a delimited file.

        Every line is stripped and split on ``sep`` and read as consecutive
        triples. Lines with fewer than ``max_record_entity * 3`` columns are
        skipped. From each kept line only two runs of
        ``max_record_entity // 2`` triples are used: one starting at column 0
        and one starting at triple 11 (column 33). The original notes describe
        22 triples per line, so this presumably selects the head of each half
        -- TODO confirm against the data format.

        For every selected triple, the first column is stored in the entity
        list and the second in the mention list, both after
        ``preprocess_unit`` and truncation to ``sequence_length``. The mask is
        ``0.0`` when both texts are empty and ``1.0`` otherwise. The label is
        ``[1, 0]`` when the third column equals ``'1'``, else ``[0, 1]`` for a
        non-empty pair and ``[0, 0]`` for an empty pair.

        :param filepath: path of the UTF-8 encoded input file
        :param sep: column separator handed to ``split``
        :param max_record_entity: number of triple slots in each output list
        :param sequence_length: maximum length kept for each text
        :param y_value: accepted for signature compatibility; not used here
        :return: tuple ``(mention_lists, entity_lists, label_lists,
                 mask_lists)`` of numpy arrays; slot ``k`` of each holds the
                 values of triple ``k`` across all kept lines (slots that were
                 never filled keep their initial ``0``)
        """

        print("Loading labelled data from " + filepath)
        label_lists = [0] * max_record_entity
        mention_lists = [0] * max_record_entity
        entity_lists = [0] * max_record_entity
        mask_lists = [0] * max_record_entity

        # Floor division keeps slice bounds and list indices integral on
        # Python 3 as well; on Python 2 it equals the original int "/".
        half_width = (max_record_entity // 2) * 3
        second_start = 11 * 3

        line_num = 0
        # The context manager guarantees the handle is closed even when a
        # malformed line raises part-way through the file.
        with codecs.open(filepath, "r", "utf-8") as data_file:
            for raw_line in data_file:
                fields = raw_line.strip().split(sep)
                if len(fields) < max_record_entity * 3:
                    continue

                # Keep only the columns that are needed.
                items = fields[:half_width]
                items.extend(fields[second_start:second_start + half_width])

                for index in range(0, len(items), 3):
                    slot = index // 3
                    content1_fixed = preprocess_unit(items[index])
                    content2_fixed = preprocess_unit(items[index + 1])
                    # The pair counts as empty only when both texts are empty.
                    if content1_fixed == '' and content2_fixed == '':
                        flag_empty = 0.0
                    else:
                        flag_empty = 1.0

                    # Truncate when longer than the maximum length (slicing
                    # is a no-op for shorter values).
                    content1_fixed = content1_fixed[:sequence_length]
                    content2_fixed = content2_fixed[:sequence_length]

                    if items[index + 2] == '1':
                        label = [1, 0]
                    elif flag_empty == 1.0:
                        label = [0, 1]
                    else:
                        label = [0, 0]

                    if line_num == 0:
                        # First kept line: replace the placeholder 0 in each
                        # slot with a fresh list.
                        # Open question from the original author: should the
                        # entity and mention lists be swapped?
                        entity_lists[slot] = [content1_fixed]
                        mention_lists[slot] = [content2_fixed]
                        mask_lists[slot] = [flag_empty]
                        label_lists[slot] = [label]
                    else:
                        entity_lists[slot].append(content1_fixed)
                        mention_lists[slot].append(content2_fixed)
                        mask_lists[slot].append(flag_empty)
                        label_lists[slot].append(label)

                line_num += 1

        print('load records %d' % (line_num))
        return np.asarray(mention_lists), np.asarray(entity_lists), np.asarray(
            label_lists), np.asarray(mask_lists)