Ejemplo n.º 1
0
 def Load_Each_Data(self, path=None, shuffle=False):
     """
     Read a three-column, space-separated token file into sentence instances.

     Every non-blank line must contain exactly three space-separated fields.
     The first field is stored (lower-cased) as the word and the third field
     as its label. Blank lines mark sentence boundaries.

     :param path: path of the UTF-8 encoded data file; must not be None.
     :param shuffle: not used by this method.
     :return: list of ``Instance`` objects, one per sentence, each with
         ``words``, ``labels`` and ``words_size`` filled in.
     :raises AssertionError: if ``path`` is None or a non-blank line does not
         have exactly three fields.
     """
     assert path is not None, "The Data Path Is Not Allow Empty."
     insts = []
     with open(path, encoding="UTF-8") as f:
         inst = Instance()
         for now_line, line in enumerate(f, start=1):
             # Progress indicator, rewritten in place on the same console line.
             sys.stdout.write("\rhandling with the {} line".format(now_line))
             line = line.strip()
             if not line:
                 # A blank line closes the pending sentence. Leading or
                 # repeated blank lines have nothing pending and are skipped;
                 # previously they fell through to the format assert below
                 # because "".split(" ") yields [""].
                 if inst.words:
                     inst.words_size = len(inst.words)
                     insts.append(inst)
                     inst = Instance()
                 continue
             fields = line.split(" ")
             assert len(fields) == 3, "Error Format"
             inst.words.append(fields[0].lower())
             inst.labels.append(fields[2])
         # The file may end without a trailing blank line.
         if inst.words:
             inst.words_size = len(inst.words)
             insts.append(inst)
         print("\n")
     return insts
 def _Load_Each_Data(self, path=None, shuffle=False):
     """
     Read a space-separated token/label file into sentence instances.

     On each non-blank line the first field is the word and the last field
     is the label (e.g. ``['EU', 'S-ORG']``). Blank lines separate sentences.

     :param path: path of the UTF-8 encoded data file; must not be None.
     :param shuffle: not used by this method.
     :return: list of ``Instance`` objects with ``chars``, ``words``,
         ``labels`` and ``words_size`` filled in.
     :raises AssertionError: if ``path`` is None.
     """
     assert path is not None, "The Data Path Is Not Allow Empty."
     insts = []
     with open(path, encoding="UTF-8") as f:
         inst = Instance()
         for line in f:
             line = line.strip()
             if line:
                 fields = line.split(" ")
                 word = fields[0]
                 # Original author's note: char is a list made of the
                 # letters of the word.
                 char = self._add_char(word)
                 # Original author's note: returns a string in which the
                 # digits of the word are replaced by '0'
                 # (engl7sh -> engl0sh).
                 word = self._normalize_word(word)
                 # inst.chars collects one char entry per word.
                 inst.chars.append(char)
                 # inst.words collects the normalized words.
                 inst.words.append(word)
                 inst.labels.append(fields[-1])
             elif inst.words:
                 # A blank line closes the pending sentence.
                 inst.words_size = len(inst.words)
                 insts.append(inst)
                 inst = Instance()
             # Blank lines with no pending sentence are ignored. Previously
             # they were parsed as a token, adding an empty word with an
             # empty label to the next sentence.

             # Limits how many sentences are read; per the original note
             # this is usually -1, which never matches, so everything is read.
             if len(insts) == self.max_count:
                 break
         # The file may end without a trailing blank line.
         if inst.words:
             inst.words_size = len(inst.words)
             insts.append(inst)
     return insts
 def _Load_Each_Data(self, path=None, shuffle=False):
     """
     Load sentences from a space-separated token/label file.

     For every non-blank line, the first field is passed to
     ``self._add_char`` and ``self._normalize_word`` and the results are
     stored in ``inst.chars`` and ``inst.words``; the last field is stored
     as the label. A blank line ends the current sentence.

     :param path: path of the UTF-8 encoded data file; must not be None.
     :param shuffle: not used by this method.
     :return: list of ``Instance`` objects, one per sentence.
     :raises AssertionError: if ``path`` is None.
     """
     assert path is not None, "The Data Path Is Not Allow Empty."
     insts = []
     with open(path, encoding="UTF-8") as f:
         inst = Instance()
         for raw_line in f:
             columns = raw_line.strip()
             if not columns:
                 # Only flush when a sentence is actually pending; stray
                 # blank lines used to be parsed as an empty token with an
                 # empty label, which is now avoided.
                 if inst.words:
                     inst.words_size = len(inst.words)
                     insts.append(inst)
                     inst = Instance()
             else:
                 columns = columns.split(" ")
                 token = columns[0]
                 inst.chars.append(self._add_char(token))
                 inst.words.append(self._normalize_word(token))
                 inst.labels.append(columns[-1])
             # Stop once the configured number of sentences has been read.
             if len(insts) == self.max_count:
                 break
         # Flush the last sentence when the file lacks a trailing blank line.
         if inst.words:
             inst.words_size = len(inst.words)
             insts.append(inst)
     return insts
Ejemplo n.º 4
0
    def Load_Each_Data(self, path=None, shuffle=False):
        """
        Read a two-column, space-separated token/label file into instances.

        Blank lines end the current sentence (or are skipped when no sentence
        is pending). Lines whose first field is ``-DOCSTART-`` are skipped.
        Words are stored lower-cased.

        :param path: path of the UTF-8 encoded data file; must not be None.
        :param shuffle: not used by this method.
        :return: list of ``Instance`` objects, one per sentence.
        :raises AssertionError: if ``path`` is None or a non-blank line does
            not have exactly two fields.
        """
        assert path is not None, "The Data Path Is Not Allow Empty."
        insts = []
        with open(path, encoding="UTF-8") as f:
            inst = Instance()
            for line_no, raw in enumerate(f.readlines(), 1):
                # Progress indicator, rewritten in place on the console.
                sys.stdout.write("\rhandling with the {} line".format(line_no))
                text = raw.strip()
                if text == "":
                    # Sentence boundary: flush only if something is pending.
                    if len(inst.words) != 0:
                        inst.words_size = len(inst.words)
                        insts.append(inst)
                        inst = Instance()
                    continue
                fields = text.split(" ")
                assert len(fields) == 2, "Error Format"
                token, tag = fields
                if token == "-DOCSTART-":
                    continue
                inst.words.append(token.lower())
                inst.labels.append(tag)
            # Flush the final sentence when the file has no trailing blank line.
            if len(inst.words) != 0:
                inst.words_size = len(inst.words)
                insts.append(inst)
            print("\n")

        return insts
    def _Load_Each_JsonData(self, path=None, path_id=0, train=False):
        """
        Read a JSON-lines file (one JSON object per line) into instances.

        Each object must provide ``fact`` (a string) and ``meta`` with the
        keys ``accusation`` and ``relevant_articles``. The fact is split on
        whitespace and truncated to ``self.max_train_len`` tokens.

        :param path: path of the UTF-8 encoded data file; must not be None.
        :param path_id: index into ``self.bert_path`` used when
            ``self.use_bert`` is set.
        :param train: not used by this method.
        :return: list of ``Instance`` objects; when ``self.use_bert`` is set,
            whatever ``self._read_bert_file`` returns for that list.
        :raises AssertionError: if ``path`` is None.
        """
        assert path is not None, "The Data Path Is Not Allow Empty."
        insts = []
        now_lines = 0
        with open(path, encoding="UTF-8") as f:
            for line in f:
                now_lines += 1
                if now_lines % 2000 == 0:
                    sys.stdout.write(
                        "\rreading the {} line\t".format(now_lines))
                if not line.strip():
                    # Report and skip blank lines. Previously the line was
                    # reported but still handed to json.loads, which raises
                    # on an empty document.
                    print("empty line")
                    continue
                line_json = json.loads(line)
                fact = line_json["fact"].split()[:self.max_train_len]
                meta = line_json["meta"]

                inst = Instance()
                inst.words = fact
                # Tokens joined without separators, then cut to the
                # configured character budget.
                inst.bert_line = "".join(fact)[:self.bert_max_char_length]
                inst.accu_labels = meta["accusation"]
                inst.law_labels = meta["relevant_articles"]

                inst.words_size = len(inst.words)
                inst.accu_labels_size = len(inst.accu_labels)
                inst.law_labels_size = len(inst.law_labels)
                insts.append(inst)
                if len(insts) == self.max_count:
                    break
            # Final progress update with the number of lines consumed.
            sys.stdout.write("\rreading the {} line\t".format(now_lines))
        if self.use_bert:
            insts = self._read_bert_file(insts, path=self.bert_path[path_id])
        return insts
 def _Load_Each_Data(self, path=None, shuffle=False):
     """
     Read a whitespace-separated classification file, one example per line.

     Field 0 of each line is the label and the fields from index 2 onward
     are the words; field 1 is not used.

     :param path: path of the UTF-8 encoded data file; must not be None.
     :param shuffle: not used by this method.
     :return: list of ``Instance`` objects with ``words``, ``labels`` and
         ``words_size`` filled in.
     :raises AssertionError: if ``path`` is None.
     """
     assert path is not None, "The Data Path Is Not Allow Empty."
     insts = []
     with open(path, encoding="UTF-8") as f:
         # Stream the file so that max_count can stop reading early.
         for raw_line in f:
             fields = raw_line.strip().split()
             if not fields:
                 # Blank lines carry no label; indexing them used to raise
                 # IndexError.
                 continue
             inst = Instance()
             # Original note: "Have LDA" keeps fields[2:]; the "No LDA"
             # variant used fields[2:-1].
             inst.words = fields[2:]
             inst.labels.append(fields[0])
             inst.words_size = len(inst.words)
             insts.append(inst)
             if len(insts) == self.max_count:
                 break
     # Debugging hint from the original author: the raw instances can be
     # inspected here (for test data) or written to a text file for
     # comparison, e.g. by printing k.labels and k.words for each k in insts.
     return insts
Ejemplo n.º 7
0
    def _Load_Each_Data(self, path=None, path_id=None):
        """
        Read a binary-labelled text file, one example per line.

        The first whitespace-separated field is the label and must be ``"0"``
        or ``"1"``; the rest of the line is passed through ``self._clean_str``
        and split into words. Blank lines and lines with another label are
        reported on stdout and skipped.

        :param path: path of the UTF-8 encoded data file; must not be None.
        :param path_id: index into ``self.bert_path`` used when
            ``self.use_bert`` is set.
        :return: list of ``Instance`` objects; when ``self.use_bert`` is set,
            whatever ``self._read_bert_file`` returns for that list.
        :raises AssertionError: if ``path`` is None.
        """
        assert path is not None, "The Data Path Is Not Allow Empty."
        insts = []
        now_lines = 0
        with open(path, encoding="UTF-8") as f:
            for raw_line in f:
                now_lines += 1
                if now_lines % 200 == 0:
                    sys.stdout.write("\rreading the {} line\t".format(now_lines))
                fields = raw_line.split()
                if not fields:
                    # The old check compared the already-stripped line with
                    # "\n", so it never matched and blank lines raised
                    # IndexError on the label lookup below.
                    print("empty line")
                    continue
                label = fields[0]
                if label not in ["0", "1"]:
                    print("Error line: ", " ".join(fields))
                    continue
                inst = Instance()
                inst.words = self._clean_str(" ".join(fields[1:])).split()
                inst.labels.append(label)
                inst.words_size = len(inst.words)
                insts.append(inst)

                if len(insts) == self.max_count:
                    break
        if self.use_bert:
            insts = self._read_bert_file(insts, path=self.bert_path[path_id])
        return insts
 def _Load_Each_Data(self, path=None, shuffle=False):
     """
     Read a ``word_label`` tagged file into character-level instances.

     Each line is NFKC-normalized and split on single spaces. Every item is
     split at its first underscore into a word and a label. For each
     sentence the method records the words, character-offset spans
     (``gold_seg``, ``gold_pos``), the characters, per-character gold
     actions (``gold``, ``pos``) and left/right character bigrams.

     :param path: path of the UTF-8 encoded data file; must not be None.
     :param shuffle: not used by this method.
     :return: list of ``Instance`` objects, one per line.
     :raises AssertionError: if ``path`` is None.
     """
     assert path is not None, "The Data Path Is Not Allow Empty."
     insts = []
     with open(path, encoding="UTF-8") as f:
         for raw_line in f:
             # Drop the trailing newline and normalize compatibility forms.
             line = unicodedata.normalize('NFKC', raw_line.strip())
             inst = Instance()
             inst.line = line
             count = 0
             for word_pos in line.split(" "):
                 # Separate the word from its label at the first underscore.
                 word, _, label = word_pos.partition("_")
                 word_length = len(word)
                 span = "[" + str(count) + "," + str(count + word_length) + "]"
                 inst.words.append(word)
                 inst.gold_seg.append(span)
                 inst.gold_pos.append(span + label)
                 count += word_length
                 for i, char in enumerate(word):
                     inst.chars.append(char)
                     if i == 0:
                         # First character of a word carries the label.
                         inst.gold.append(sep + "#" + label)
                         inst.pos.append(label)
                     else:
                         inst.gold.append(app)
             last_index = len(inst.chars) - 1
             for i, char in enumerate(inst.chars):
                 # Left bigram: previous character (or nullkey at the start).
                 # Uses == rather than identity comparison with an int
                 # literal, which is implementation-dependent.
                 if i == 0:
                     inst.bichars_left.append(nullkey + char)
                 else:
                     inst.bichars_left.append(inst.chars[i - 1] + char)
                 # Right bigram: next character (or nullkey at the end).
                 if i == last_index:
                     inst.bichars_right.append(char + nullkey)
                 else:
                     inst.bichars_right.append(char + inst.chars[i + 1])
             # Cache the sequence sizes on the instance.
             inst.chars_size = len(inst.chars)
             inst.words_size = len(inst.words)
             inst.bichars_size = len(inst.bichars_left)
             inst.gold_size = len(inst.gold)
             # One instance represents one sentence.
             insts.append(inst)
             if len(insts) == self.max_count:
                 break
     return insts