def _Load_Each_JsonData(self, path=None, path_id=0, train=False):
    """Read a JSON-lines file and build one ``Instance`` per record.

    Every non-blank line is parsed with ``json.loads``. The record must
    contain ``"fact"`` and ``"meta"`` with the keys ``"accusation"`` and
    ``"relevant_articles"``. Blank lines are reported and skipped.

    :param path: path of the UTF-8 file to read; must not be None.
    :param path_id: index into ``self.bert_path``; only used when
        ``self.use_bert`` is truthy.
    :param train: not used by this method.
    :return: list of instances; reading stops as soon as the number of
        instances equals ``self.max_count``. When ``self.use_bert`` is
        truthy the list is whatever ``self._read_bert_file`` returns.
    """
    assert path is not None, "The Data Path Is Not Allow Empty."
    insts = []
    now_lines = 0
    with open(path, encoding="UTF-8") as f:
        # Iterate lazily so stopping at max_count does not read the rest
        # of the file into memory.
        for now_lines, line in enumerate(f, 1):
            if now_lines % 2000 == 0:
                sys.stdout.write(
                    "\rreading the {} line\t".format(now_lines))
            if not line.strip():
                # json.loads raises on an empty document, so a blank line
                # must be skipped rather than parsed.
                print("empty line")
                continue

            line_json = json.loads(line)
            # Whitespace tokens of the fact text, capped at max_train_len.
            fact = line_json["fact"].split()[:self.max_train_len]
            meta = line_json["meta"]

            inst = Instance()
            inst.words = fact
            # Tokens joined without a separator, capped by character count.
            inst.bert_line = "".join(fact)[:self.bert_max_char_length]
            inst.accu_labels = meta["accusation"]
            inst.law_labels = meta["relevant_articles"]

            inst.words_size = len(inst.words)
            inst.accu_labels_size = len(inst.accu_labels)
            inst.law_labels_size = len(inst.law_labels)
            insts.append(inst)
            if len(insts) == self.max_count:
                break
        # Final progress report with the number of lines actually read.
        sys.stdout.write("\rreading the {} line\t".format(now_lines))
    if self.use_bert:
        insts = self._read_bert_file(insts, path=self.bert_path[path_id])
    return insts
 def _Load_Each_Data(self, path=None, shuffle=False):
     """
     Read a whitespace-separated text file and build one Instance per line.

     The first token of a line is stored as the label and the tokens from
     index 2 onward are stored as the words; the token at index 1 is not
     used here. Blank lines are skipped.

     :param path: path of the UTF-8 file to read; must not be None.
     :param shuffle: not used by this method.
     :return: list of instances; reading stops as soon as the number of
         instances equals ``self.max_count``.
     """
     assert path is not None, "The Data Path Is Not Allow Empty."
     insts = []
     with open(path, encoding="UTF-8") as f:
         # Iterate lazily so stopping at max_count does not tokenise the
         # remainder of the file.
         for raw_line in f:
             fields = raw_line.strip().split()
             if not fields:
                 # A blank line has no label token; indexing it would
                 # raise IndexError.
                 continue
             inst = Instance()
             # "Have LDA" layout per the original note: words start at
             # index 2 (the "No LDA" variant used fields[2:-1]).
             inst.words = fields[2:]
             inst.labels.append(fields[0])
             inst.words_size = len(inst.words)
             insts.append(inst)
             if len(insts) == self.max_count:
                 break
     # The list holds the raw data as read; for debugging (e.g. in test
     # runs, or to dump to a txt file for comparison) print
     # ``k.labels`` and ``k.words`` for each ``k`` in ``insts``.
     return insts
Example #3
0
    def _Load_Each_Data(self, path=None, path_id=None):
        """
        Read a labelled text file and build one Instance per valid line.

        The first whitespace token of a line is the label and must be "0"
        or "1"; other lines are reported and skipped. The remaining tokens
        are re-joined with single spaces, passed through
        ``self._clean_str`` and split again to form the words. Blank lines
        are reported and skipped.

        :param path: path of the UTF-8 file to read; must not be None.
        :param path_id: index into ``self.bert_path``; only used when
            ``self.use_bert`` is truthy.
        :return: list of instances; reading stops as soon as the number of
            instances equals ``self.max_count``. When ``self.use_bert`` is
            truthy the list is whatever ``self._read_bert_file`` returns.
        """
        assert path is not None, "The Data Path Is Not Allow Empty."
        insts = []
        with open(path, encoding="UTF-8") as f:
            for now_lines, raw_line in enumerate(f, 1):
                if now_lines % 200 == 0:
                    sys.stdout.write("\rreading the {} line\t".format(now_lines))

                tokens = raw_line.split()
                if not tokens:
                    # Test for emptiness after splitting: a comparison
                    # against "\n" on an already stripped line never
                    # matches, and tokens[0] would raise IndexError.
                    print("empty line")
                    continue

                label = tokens[0]
                if label not in ("0", "1"):
                    print("Error line: ", " ".join(tokens))
                    continue

                inst = Instance()
                inst.words = self._clean_str(" ".join(tokens[1:])).split()
                inst.labels.append(label)
                inst.words_size = len(inst.words)
                insts.append(inst)

                if len(insts) == self.max_count:
                    break
        if self.use_bert:
            insts = self._read_bert_file(insts, path=self.bert_path[path_id])
        return insts